[Mesa-dev] [PATCH] nv50/ir, nvc0: use constant buffers for compute when possible on Kepler+

Fri Aug 3 22:04:55 UTC 2018

On Fri, Aug 3, 2018 at 5:57 PM, Rhys Perry <pendingchaos02 at gmail.com> wrote:
> Previously, UBOs were implemented using global memory loads for compute on
> Kepler+ because it only supported 8 constant buffers on compute. This
> required bounds checking and expensive load instructions.
>
> However, 6 of the constant buffer bindings were left unused. This patch
> uses them instead of loading from global memory in the shader for the
> first 6 non-user constant buffers when possible.
>
> total instructions in shared programs : 5787979 -> 5748677 (-0.68%)
> total gprs used in shared programs    : 669901 -> 669373 (-0.08%)
> total shared used in shared programs  : 548832 -> 548832 (0.00%)
> total local used in shared programs   : 21068 -> 21064 (-0.02%)
>
>                 local     shared        gpr       inst      bytes
>     helped           1           0         152         274         274
>       hurt           0           0           0           0           0
>
> Signed-off-by: Rhys Perry <pendingchaos02 at gmail.com>
> ---
>  .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp      | 18 +++++++-------
>  src/gallium/drivers/nouveau/nvc0/nve4_compute.c    | 28 ++++++++++++++++++++++
>  2 files changed, 36 insertions(+), 10 deletions(-)
>
> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
> index 1410cf26c8..0fba96f261 100644
> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
> @@ -2437,18 +2437,16 @@ NVC0LoweringPass::handleLDST(Instruction *i)
>           assert(prog->getType() != Program::TYPE_FRAGMENT); // INTERP
>        }
>     } else if (i->src(0).getFile() == FILE_MEMORY_CONST) {
> +      int8_t fileIndex = i->getSrc(0)->reg.fileIndex - 1;
> +      Value *ind = i->getIndirect(0, 1);
> +
>        if (targ->getChipset() >= NVISA_GK104_CHIPSET &&
> -          prog->getType() == Program::TYPE_COMPUTE) {
> +          prog->getType() == Program::TYPE_COMPUTE &&
> +          (fileIndex >= 6 || ind)) {
>           // The launch descriptor only allows to set up 8 CBs, but OpenGL
> -         // requires at least 12 UBOs. To bypass this limitation, we store the
> -         // addrs into the driver constbuf and we directly load from the global
> -         // memory.
> -         int8_t fileIndex = i->getSrc(0)->reg.fileIndex - 1;
> -         Value *ind = i->getIndirect(0, 1);
> -
> -         if (!ind && fileIndex == -1)
> -            return;
> -
> +         // requires at least 12 UBOs. To bypass this limitation, for constant
> +         // buffers 7+, we store the addrs into the driver constbuf and we
> +         // directly load from the global memory.
>           if (ind) {
>              // Clamp the UBO index when an indirect access is used to avoid
>              // loading information from the wrong place in the driver cb.
> diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
> index 28460f8cbe..8aa8d4936f 100644
> --- a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
> +++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
> @@ -551,6 +551,30 @@ nve4_compute_derive_cache_split(struct nvc0_context *nvc0, uint32_t shared_size)
>     return NVC1_3D_CACHE_SPLIT_16K_SHARED_48K_L1;
>  }
>
> +static void
> +nve4_compute_setup_buf_cb(struct nvc0_context *nvc0, bool gp100, void *desc)
> +{
> +   // only user constant buffers 1-6 can be put in the descriptor, the rest are
> +   // loaded through global memory
> +   for (int i = 1; i <= 6; i++) {
> +      if (nvc0->constbuf[5][i].user || !nvc0->constbuf[5][i].u.buf)
> +         continue;
> +
> +      struct nv04_resource *res =
> +         nv04_resource(nvc0->constbuf[5][i].u.buf);
> +
> +      uint32_t base = res->offset + nvc0->constbuf[5][i].offset;

No compiler error on this one? Ouch. `base` obviously needs to be a uint64_t.

> +      uint32_t size = nvc0->constbuf[5][i].size;
> +      if (gp100)
> +         gp100_cp_launch_desc_set_cb(desc, i, res->bo, base, size);
> +      else
> +         nve4_cp_launch_desc_set_cb(desc, i, res->bo, base, size);
> +   }
> +
> +   // there is no need to do FLUSH(NVE4_COMPUTE_FLUSH_CB) because
> +   // nve4_compute_upload_input() does it later
> +}
> +
>  static void
>  nve4_compute_setup_launch_desc(struct nvc0_context *nvc0,
>                                 struct nve4_cp_launch_desc *desc,
> @@ -588,6 +612,8 @@ nve4_compute_setup_launch_desc(struct nvc0_context *nvc0,
>     }
>     nve4_cp_launch_desc_set_cb(desc, 7, screen->uniform_bo,
>                                NVC0_CB_AUX_INFO(5), 1 << 11);
> +
> +   nve4_compute_setup_buf_cb(nvc0, false, desc);
>  }
>
>  static void
> @@ -626,6 +652,8 @@ gp100_compute_setup_launch_desc(struct nvc0_context *nvc0,
>     }
>     gp100_cp_launch_desc_set_cb(desc, 7, screen->uniform_bo,
>                                 NVC0_CB_AUX_INFO(5), 1 << 11);
> +
> +   nve4_compute_setup_buf_cb(nvc0, true, desc);
>  }
>
>  static inline void *
> --
> 2.14.4
>