[Mesa-dev] [PATCH 03/11] i965: Select ranges of UBO data to be uploaded as push constants.

Matt Turner mattst88 at gmail.com
Mon Jul 10 18:30:12 UTC 2017


On Thu, Jul 6, 2017 at 5:22 PM, Kenneth Graunke <kenneth at whitecape.org> wrote:
> This adds a NIR pass that decides which portions of UBOs we should
> upload as push constants, rather than pull constants.
> ---
>  src/intel/Makefile.sources                      |   1 +
>  src/intel/compiler/brw_compiler.h               |  11 +
>  src/intel/compiler/brw_nir.h                    |   4 +
>  src/intel/compiler/brw_nir_analyze_ubo_ranges.c | 271 ++++++++++++++++++++++++
>  src/mesa/drivers/dri/i965/brw_gs.c              |   2 +
>  src/mesa/drivers/dri/i965/brw_tcs.c             |   2 +
>  src/mesa/drivers/dri/i965/brw_tes.c             |   2 +
>  src/mesa/drivers/dri/i965/brw_vs.c              |   2 +
>  src/mesa/drivers/dri/i965/brw_wm.c              |   2 +
>  9 files changed, 297 insertions(+)
>  create mode 100644 src/intel/compiler/brw_nir_analyze_ubo_ranges.c
>
> diff --git a/src/intel/Makefile.sources b/src/intel/Makefile.sources
> index b672e615c52..f0a8bf517a1 100644
> --- a/src/intel/Makefile.sources
> +++ b/src/intel/Makefile.sources
> @@ -73,6 +73,7 @@ COMPILER_FILES = \
>         compiler/brw_nir.h \
>         compiler/brw_nir.c \
>         compiler/brw_nir_analyze_boolean_resolves.c \
> +       compiler/brw_nir_analyze_ubo_ranges.c \
>         compiler/brw_nir_attribute_workarounds.c \
>         compiler/brw_nir_intrinsics.c \
>         compiler/brw_nir_opt_peephole_ffma.c \
> diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h
> index e4c22e31177..d8e7717e867 100644
> --- a/src/intel/compiler/brw_compiler.h
> +++ b/src/intel/compiler/brw_compiler.h
> @@ -468,6 +468,15 @@ struct brw_image_param {
>   */
>  #define BRW_SHADER_TIME_STRIDE 64
>
> +struct brw_ubo_range
> +{
> +   // XXX: jason says that 255 won't be enough for vulkan - we may have
> +   // large amounts of UBOs in the future.  use uint16_t.
> +   uint8_t block;
> +   uint8_t start;
> +   uint8_t length;
> +};
> +
>  struct brw_stage_prog_data {
>     struct {
>        /** size of our binding table. */
> @@ -488,6 +497,8 @@ struct brw_stage_prog_data {
>        /** @} */
>     } binding_table;
>
> +   struct brw_ubo_range ubo_ranges[4];
> +
>     GLuint nr_params;       /**< number of float params/constants */
>     GLuint nr_pull_params;
>     unsigned nr_image_params;
> diff --git a/src/intel/compiler/brw_nir.h b/src/intel/compiler/brw_nir.h
> index 5d866b86ac8..560027c3662 100644
> --- a/src/intel/compiler/brw_nir.h
> +++ b/src/intel/compiler/brw_nir.h
> @@ -142,6 +142,10 @@ void brw_nir_setup_glsl_uniforms(nir_shader *shader,
>  void brw_nir_setup_arb_uniforms(nir_shader *shader, struct gl_program *prog,
>                                  struct brw_stage_prog_data *stage_prog_data);
>
> +void brw_nir_analyze_ubo_ranges(const struct brw_compiler *compiler,
> +                                nir_shader *nir,
> +                                struct brw_ubo_range out_ranges[4]);
> +
>  bool brw_nir_opt_peephole_ffma(nir_shader *shader);
>
>  #define BRW_NIR_FRAG_OUTPUT_INDEX_SHIFT 0
> diff --git a/src/intel/compiler/brw_nir_analyze_ubo_ranges.c b/src/intel/compiler/brw_nir_analyze_ubo_ranges.c
> new file mode 100644
> index 00000000000..3535e67758c
> --- /dev/null
> +++ b/src/intel/compiler/brw_nir_analyze_ubo_ranges.c
> @@ -0,0 +1,271 @@
> +/*
> + * Copyright © 2015 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> + * IN THE SOFTWARE.
> + */
> +
> +#include "brw_nir.h"
> +#include "compiler/nir/nir.h"
> +#include "util/u_dynarray.h"
> +
> +/**
> + * \file brw_nir_analyze_ubo_ranges.c
> + *
> + * This pass decides which portions of UBOs to upload as push constants,
> + * so shaders can access them as part of the thread payload, rather than
> + * having to issue expensive memory reads to pull the data.
> + *
> + * The 3DSTATE_CONSTANT_* mechanism can push data from up to 4 different
> + * buffers, in GRF (256-bit/32-byte) units.
> + *
> + * To do this, we examine NIR load_ubo intrinsics, recording the number of
> + * loads at each offset.  We track offsets at a 32-byte granularity, so even
> + * fields with a bit of padding between them tend to fall into contiguous
> + * ranges.  We build a list of these ranges, tracking their "cost" (number
> + * of registers required) and "benefit" (number of pull loads eliminated
> + * by pushing the range).  We then sort the list to obtain the four best
> + * ranges (most benefit for the least cost).
> + */
> +
> +struct ubo_range_entry
> +{
> +   struct brw_ubo_range range;
> +   int benefit;
> +};
> +
> +static int
> +score(const struct ubo_range_entry *entry)
> +{
> +   return 2 * entry->benefit - entry->range.length;
> +}
> +
> +/**
> + * Compares score for two UBO range entries.
> + *
> + * For a descending qsort().
> + */
> +static int
> +cmp_ubo_range_entry(const void *va, const void *vb)
> +{
> +   const struct ubo_range_entry *a = va;
> +   const struct ubo_range_entry *b = vb;
> +
> +   /* Rank based on scores */
> +   int delta = score(b) - score(a);
> +
> +   /* Then use the UBO block index as a tie-breaker */
> +   if (delta == 0)
> +      delta = b->range.block - a->range.block;
> +
> +   /* Finally use the UBO offset as a second tie-breaker */
> +   if (delta == 0)
> +      delta = b->range.block - a->range.block;
> +
> +   return delta;
> +}
> +
> +struct ubo_block_info
> +{
> +   uint64_t offsets;
> +   uint8_t uses[64];
> +};
> +
> +struct ubo_analysis_state
> +{
> +   struct hash_table *blocks;
> +   bool uses_regular_uniforms;
> +};
> +
> +static struct ubo_block_info *
> +get_block_info(struct ubo_analysis_state *state, int block)
> +{
> +   uint32_t hash = block + 1;
> +   void *key = (void *) (uintptr_t) hash;
> +
> +   struct hash_entry *entry =
> +      _mesa_hash_table_search_pre_hashed(state->blocks, hash, key);
> +
> +   if (entry)
> +      return (struct ubo_block_info *) entry->data;
> +
> +   struct ubo_block_info *info =
> +      rzalloc(state->blocks, struct ubo_block_info);
> +   _mesa_hash_table_insert_pre_hashed(state->blocks, hash, key, info);
> +
> +   return info;
> +}
> +
> +static void
> +analyze_ubos_block(struct ubo_analysis_state *state, nir_block *block)
> +{
> +   nir_foreach_instr(instr, block) {
> +      if (instr->type != nir_instr_type_intrinsic)
> +         continue;
> +
> +      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
> +      if (intrin->intrinsic == nir_intrinsic_load_uniform)
> +         state->uses_regular_uniforms = true;
> +
> +      if (intrin->intrinsic != nir_intrinsic_load_ubo)
> +         continue;
> +
> +      nir_const_value *block_const = nir_src_as_const_value(intrin->src[0]);
> +      nir_const_value *offset_const = nir_src_as_const_value(intrin->src[1]);
> +
> +      if (block_const && offset_const) {
> +         const int block = block_const->u32[0];
> +         const int offset = offset_const->u32[0] / 32;
> +
> +         /* Won't fit in our bitfield */
> +         if (offset >= 64)
> +            continue;
> +
> +         /* TODO: should we count uses in loops as higher benefit? */
> +
> +         struct ubo_block_info *info = get_block_info(state, block);
> +         info->offsets |= 1ull << offset;
> +         info->uses[offset]++;
> +      }
> +   }
> +}
> +
> +static void
> +print_ubo_entry(FILE *file,
> +                const struct ubo_range_entry *entry,
> +                struct ubo_analysis_state *state)
> +{
> +   struct ubo_block_info *info = get_block_info(state, entry->range.block);
> +
> +   fprintf(file,
> +           "block %2d, start %2d, length %2d, bits = %zx, "
> +           "benefit %2d, cost %2d, score = %2d\n",
> +           entry->range.block, entry->range.start, entry->range.length,
> +           info->offsets, entry->benefit, entry->range.length, score(entry));
> +}
> +
> +void
> +brw_nir_analyze_ubo_ranges(const struct brw_compiler *compiler,
> +                           nir_shader *nir,
> +                           struct brw_ubo_range out_ranges[4])
> +{
> +   const struct gen_device_info *devinfo = compiler->devinfo;
> +
> +   if (devinfo->gen <= 7 && !devinfo->is_haswell) {
> +      memset(out_ranges, 0, 4 * sizeof(struct brw_ubo_range));
> +      return;
> +   }
> +
> +   void *mem_ctx = ralloc_context(NULL);
> +
> +   struct ubo_analysis_state state = {
> +      .uses_regular_uniforms = false,
> +      .blocks =
> +         _mesa_hash_table_create(mem_ctx, NULL, _mesa_key_pointer_equal),
> +   };
> +
> +   /* Walk the IR, recording how many times each UBO block/offset is used. */
> +   nir_foreach_function(function, nir) {
> +      if (function->impl) {
> +         nir_foreach_block(block, function->impl) {
> +            analyze_ubos_block(&state, block);
> +         }
> +      }
> +   }
> +
> +   /* Find ranges. */
> +   struct util_dynarray ranges;
> +   util_dynarray_init(&ranges, mem_ctx);
> +
> +   struct hash_entry *entry;
> +   hash_table_foreach(state.blocks, entry) {
> +      const int b = entry->hash - 1;
> +      const struct ubo_block_info *info = entry->data;
> +      uint64_t offsets = info->offsets;
> +
> +      while (offsets != 0) {
> +         int first_bit = ffsll(offsets) - 1;

Okay, get the zero-indexed first set bit.

> +         int first_hole = ffsll(~offsets & ~((1ull << first_bit) - 1)) - 1;

~((1ull << first_bit) - 1) gives the mask of bits greater than or
equal to first_bit.

offsets is a bitmask indicating presence of data within a UBO block?
So ~offsets is the bitmask of the padding, and AND'ing gives us a
bitmask of ???

I'm lost.

Do you just want to use __builtin_clz (or util_logbase2)?

> +         if (first_hole == -1) {
> +            first_hole = 64;
> +            offsets = 0;

Okay, so anyway, first_hole == -1 indicates the whole block is full,
in which case we want to consider the whole block starting at
offset=0. Wait, no. offsets is still a mask...

Need some help getting through this patch.


More information about the mesa-dev mailing list