[Mesa-dev] [PATCH] draw: improve vertex fetch (v2)

Jose Fonseca jfonseca at vmware.com
Tue Oct 18 22:13:09 UTC 2016


On 15/10/16 02:54, sroland at vmware.com wrote:
> From: Roland Scheidegger <sroland at vmware.com>
>
> The per-element fetch has quite some calculations which are constant,
> these can be moved outside both the per-element and the main
> shader loop (llvm can figure out it's constant mostly on its own, though
> this can have a significant compile time cost).
> Similarly, it looks easier to swap the fetch loops (outer loop per attrib,
> inner loop filling up the per vertex elements - this way the aos->soa
> conversion can also be done per attrib and not just at the end, though again
> this doesn't really make much of a difference in the generated code). (This
> would also make it possible to vectorize the calculations leading to the
> fetches.)
> There's also some minimal change simplifying the overflow math slightly.
> All in all, the generated code seems to look slightly simpler (depending
> on the actual vs), but more importantly, I've seen a significant reduction
> in compile times for some vs (albeit with an old (3.3) llvm version, and the
> time reduction is only really for the optimizations run on the IR).
> v2: adapt to other draw change.
>
> No changes with piglit.
> ---
>  src/gallium/auxiliary/draw/draw_llvm.c             | 190 +++++++++++----------
>  .../auxiliary/gallivm/lp_bld_arit_overflow.c       |  24 +++
>  .../auxiliary/gallivm/lp_bld_arit_overflow.h       |   6 +
>  3 files changed, 134 insertions(+), 86 deletions(-)
>
> diff --git a/src/gallium/auxiliary/draw/draw_llvm.c b/src/gallium/auxiliary/draw/draw_llvm.c
> index 3b56856..2f82d9d 100644
> --- a/src/gallium/auxiliary/draw/draw_llvm.c
> +++ b/src/gallium/auxiliary/draw/draw_llvm.c
> @@ -659,85 +659,42 @@ generate_vs(struct draw_llvm_variant *variant,
>  static void
>  generate_fetch(struct gallivm_state *gallivm,
>                 struct draw_context *draw,
> -               LLVMValueRef vbuffers_ptr,
> +               const struct util_format_description *format_desc,
> +               LLVMValueRef vb_stride,
> +               LLVMValueRef stride_fixed,
> +               LLVMValueRef map_ptr,
> +               LLVMValueRef buffer_size_adj,
> +               LLVMValueRef ofbit,
>                 LLVMValueRef *res,
> -               struct pipe_vertex_element *velem,
> -               LLVMValueRef vbuf,
> -               LLVMValueRef index,
> -               LLVMValueRef instance_id,
> -               LLVMValueRef start_instance)
> +               LLVMValueRef index)
>  {
> -   const struct util_format_description *format_desc =
> -      util_format_description(velem->src_format);
>     LLVMValueRef zero = LLVMConstNull(LLVMInt32TypeInContext(gallivm->context));
>     LLVMBuilderRef builder = gallivm->builder;
> -   LLVMValueRef indices =
> -      LLVMConstInt(LLVMInt64TypeInContext(gallivm->context),
> -                   velem->vertex_buffer_index, 0);
> -   LLVMValueRef vbuffer_ptr = LLVMBuildGEP(builder, vbuffers_ptr,
> -                                           &indices, 1, "");
> -   LLVMValueRef vb_stride = draw_jit_vbuffer_stride(gallivm, vbuf);
> -   LLVMValueRef vb_buffer_offset = draw_jit_vbuffer_offset(gallivm, vbuf);
> -   LLVMValueRef map_ptr = draw_jit_dvbuffer_map(gallivm, vbuffer_ptr);
> -   LLVMValueRef buffer_size = draw_jit_dvbuffer_size(gallivm, vbuffer_ptr);
>     LLVMValueRef stride;
>     LLVMValueRef buffer_overflowed;
> -   LLVMValueRef needed_buffer_size;
>     LLVMValueRef temp_ptr =
>        lp_build_alloca(gallivm,
>                        lp_build_vec_type(gallivm, lp_float32_vec4_type()), "");
> -   LLVMValueRef ofbit = NULL;
>     struct lp_build_if_state if_ctx;
>
> -   if (velem->src_format == PIPE_FORMAT_NONE) {
> +   if (format_desc->format == PIPE_FORMAT_NONE) {
>        *res = lp_build_const_vec(gallivm, lp_float32_vec4_type(), 0);
>        return;
>     }
>
> -   if (velem->instance_divisor) {
> -      /* Index is equal to the start instance plus the number of current
> -       * instance divided by the divisor. In this case we compute it as:
> -       * index = start_instance + (instance_id  / divisor)
> -       */
> -      LLVMValueRef current_instance;
> -      current_instance = LLVMBuildUDiv(builder, instance_id,
> -                                       lp_build_const_int32(gallivm, velem->instance_divisor),
> -                                       "instance_divisor");
> -      index = lp_build_uadd_overflow(gallivm, start_instance,
> -                                     current_instance, &ofbit);
> -   }
> -
>     stride = lp_build_umul_overflow(gallivm, vb_stride, index, &ofbit);
> -   stride = lp_build_uadd_overflow(gallivm, stride, vb_buffer_offset, &ofbit);
> -   stride = lp_build_uadd_overflow(
> -      gallivm, stride,
> -      lp_build_const_int32(gallivm, velem->src_offset), &ofbit);
> -   needed_buffer_size = lp_build_uadd_overflow(
> -      gallivm, stride,
> -      lp_build_const_int32(gallivm,
> -                           util_format_get_blocksize(velem->src_format)),
> -      &ofbit);
> +   stride = lp_build_uadd_overflow(gallivm, stride, stride_fixed, &ofbit);
>
>     buffer_overflowed = LLVMBuildICmp(builder, LLVMIntUGT,
> -                                     needed_buffer_size, buffer_size,
> +                                     stride, buffer_size_adj,
>                                       "buffer_overflowed");
>     buffer_overflowed = LLVMBuildOr(builder, buffer_overflowed, ofbit, "");
> -#if 0
> -   lp_build_printf(gallivm, "vbuf index = %u, vb_stride is %u\n",
> -                   index, vb_stride);
> -   lp_build_printf(gallivm, "   vb_buffer_offset = %u, src_offset is %u\n",
> -                   vb_buffer_offset,
> -                   lp_build_const_int32(gallivm, velem->src_offset));
> -   lp_build_print_value(gallivm, "   blocksize = ",
> -                        lp_build_const_int32(
> -                           gallivm,
> -                           util_format_get_blocksize(velem->src_format)));
> -   lp_build_printf(gallivm, "   instance_id = %u\n", instance_id);
> -   lp_build_printf(gallivm, "   stride = %u\n", stride);
> -   lp_build_printf(gallivm, "   buffer size = %u\n", buffer_size);
> -   lp_build_printf(gallivm, "   needed_buffer_size = %u\n", needed_buffer_size);
> -   lp_build_print_value(gallivm, "   buffer overflowed = ", buffer_overflowed);
> -#endif
> +
> +   if (0) {
> +      lp_build_printf(gallivm, "   stride = %u\n", stride);
> +      lp_build_printf(gallivm, "   buffer size adj = %u\n", buffer_size_adj);
> +      lp_build_print_value(gallivm, "   buffer overflowed = ", buffer_overflowed);
> +   }
>
>     lp_build_if(&if_ctx, gallivm, buffer_overflowed);
>     {
> @@ -766,36 +723,34 @@ generate_fetch(struct gallivm_state *gallivm,
>
>  static void
>  convert_to_soa(struct gallivm_state *gallivm,
> -               LLVMValueRef (*src_aos)[LP_MAX_VECTOR_WIDTH / 32],
> +               LLVMValueRef src_aos[LP_MAX_VECTOR_WIDTH / 32],
>                 LLVMValueRef (*dst_soa)[TGSI_NUM_CHANNELS],
> -               unsigned num_attribs, const struct lp_type soa_type)
> +               unsigned attrib, const struct lp_type soa_type)
>  {
> -   unsigned i, j, k;
> +   unsigned j, k;
>     struct lp_type aos_channel_type = soa_type;
>
> +   LLVMValueRef aos_channels[TGSI_NUM_CHANNELS];
> +   unsigned pixels_per_channel = soa_type.length / TGSI_NUM_CHANNELS;
> +
>     debug_assert(TGSI_NUM_CHANNELS == 4);
>     debug_assert((soa_type.length % TGSI_NUM_CHANNELS) == 0);
>
>     aos_channel_type.length >>= 1;
>
> -   for (i = 0; i < num_attribs; ++i) {
> -      LLVMValueRef aos_channels[TGSI_NUM_CHANNELS];
> -      unsigned pixels_per_channel = soa_type.length / TGSI_NUM_CHANNELS;
> -
> -      for (j = 0; j < TGSI_NUM_CHANNELS; ++j) {
> -         LLVMValueRef channel[LP_MAX_VECTOR_LENGTH] = { 0 };
> +   for (j = 0; j < TGSI_NUM_CHANNELS; ++j) {
> +      LLVMValueRef channel[LP_MAX_VECTOR_LENGTH] = { 0 };
>
> -         assert(pixels_per_channel <= LP_MAX_VECTOR_LENGTH);
> -
> -         for (k = 0; k < pixels_per_channel; ++k) {
> -            channel[k] = src_aos[i][j + TGSI_NUM_CHANNELS * k];
> -         }
> +      assert(pixels_per_channel <= LP_MAX_VECTOR_LENGTH);
>
> -         aos_channels[j] = lp_build_concat(gallivm, channel, aos_channel_type, pixels_per_channel);
> +      for (k = 0; k < pixels_per_channel; ++k) {
> +         channel[k] = src_aos[j + TGSI_NUM_CHANNELS * k];
>        }
>
> -      lp_build_transpose_aos(gallivm, soa_type, aos_channels, dst_soa[i]);
> +      aos_channels[j] = lp_build_concat(gallivm, channel, aos_channel_type, pixels_per_channel);
>     }
> +
> +   lp_build_transpose_aos(gallivm, soa_type, aos_channels, dst_soa[attrib]);
>  }
>
>
> @@ -1549,6 +1504,13 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
>     LLVMValueRef io_ptr, vbuffers_ptr, vb_ptr;
>     LLVMValueRef zero = lp_build_const_int32(gallivm, 0);
>     LLVMValueRef one = lp_build_const_int32(gallivm, 1);
> +   LLVMValueRef vb_stride[PIPE_MAX_SHADER_INPUTS];
> +   LLVMValueRef map_ptr[PIPE_MAX_SHADER_INPUTS];
> +   LLVMValueRef buffer_size_adj[PIPE_MAX_SHADER_INPUTS];
> +   LLVMValueRef stride_fixed[PIPE_MAX_SHADER_INPUTS];
> +   LLVMValueRef ofbit[PIPE_MAX_SHADER_INPUTS];
> +   LLVMValueRef instance_index[PIPE_MAX_SHADER_INPUTS];
> +
>     struct draw_context *draw = llvm->draw;
>     const struct tgsi_shader_info *vs_info = &draw->vs.vertex_shader->info;
>     unsigned i, j;
> @@ -1693,14 +1655,67 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
>
>     fetch_max = LLVMBuildSub(builder, end, one, "fetch_max");
>
> +   /*
> +    * Pre-calculate everything which is constant per shader invocation.
> +    */
> +   for (j = 0; j < draw->pt.nr_vertex_elements; ++j) {
> +      LLVMValueRef vb_buffer_offset, buffer_size;
> +      LLVMValueRef vb_info, vbuffer_ptr;
> +      struct pipe_vertex_element *velem = &draw->pt.vertex_element[j];
> +      LLVMValueRef vb_index =
> +         lp_build_const_int32(gallivm, velem->vertex_buffer_index);
> +      LLVMValueRef bsize = lp_build_const_int32(gallivm,
> +                                                util_format_get_blocksize(velem->src_format));
> +      LLVMValueRef src_offset = lp_build_const_int32(gallivm,
> +                                                     velem->src_offset);
> +
> +      vbuffer_ptr = LLVMBuildGEP(builder, vbuffers_ptr, &vb_index, 1, "");
> +      vb_info = LLVMBuildGEP(builder, vb_ptr, &vb_index, 1, "");
> +      vb_stride[j] = draw_jit_vbuffer_stride(gallivm, vb_info);
> +      vb_buffer_offset = draw_jit_vbuffer_offset(gallivm, vb_info);
> +      map_ptr[j] = draw_jit_dvbuffer_map(gallivm, vbuffer_ptr);
> +      buffer_size = draw_jit_dvbuffer_size(gallivm, vbuffer_ptr);
> +
> +      ofbit[j] = NULL;
> +      stride_fixed[j] = lp_build_uadd_overflow(gallivm, vb_buffer_offset,
> +                                               src_offset, &ofbit[j]);
> +      buffer_size_adj[j] = lp_build_usub_overflow(gallivm, buffer_size, bsize,
> +                                                   &ofbit[j]);
> +
> +      if (velem->instance_divisor) {
> +         /* Index is equal to the start instance plus the number of current
> +          * instance divided by the divisor. In this case we compute it as:
> +          * index = start_instance + (instance_id  / divisor)
> +          */
> +         LLVMValueRef current_instance;
> +         current_instance = LLVMBuildUDiv(builder, system_values.instance_id,
> +                                          lp_build_const_int32(gallivm,
> +                                                               velem->instance_divisor),
> +                                          "instance_divisor");
> +         instance_index[j] = lp_build_uadd_overflow(gallivm, start_instance,
> +                                                    current_instance, &ofbit[j]);
> +      }
> +
> +      if (0) {
> +         lp_build_printf(gallivm, "vbuf index = %u, vb_stride is %u\n",
> +                         vb_index, vb_stride[j]);
> +         lp_build_printf(gallivm, "   vb_buffer_offset = %u, src_offset is %u\n",
> +                         vb_buffer_offset, src_offset);
> +         lp_build_print_value(gallivm, "   blocksize = ", bsize);
> +         lp_build_printf(gallivm, "   instance_id = %u\n", system_values.instance_id);
> +         lp_build_printf(gallivm, "   buffer size = %u\n", buffer_size);
> +      }
> +   }
> +
>     lp_build_loop_begin(&lp_loop, gallivm, zero);
>     {
>        LLVMValueRef inputs[PIPE_MAX_SHADER_INPUTS][TGSI_NUM_CHANNELS];
> -      LLVMValueRef aos_attribs[PIPE_MAX_SHADER_INPUTS][LP_MAX_VECTOR_WIDTH / 32] = { { 0 } };
> +      LLVMValueRef aos_attribs[LP_MAX_VECTOR_WIDTH / 32] = { 0 };
>        LLVMValueRef io;
>        LLVMValueRef clipmask;   /* holds the clipmask value */
>        LLVMValueRef true_index_array = lp_build_zero(gallivm,
>                                                      lp_type_uint_vec(32, 32*vector_length));
> +      LLVMValueRef true_indices[LP_MAX_VECTOR_WIDTH / 32];
>        const LLVMValueRef (*ptr_aos)[TGSI_NUM_CHANNELS];
>
>        io_itr = lp_loop.counter;
> @@ -1760,22 +1775,25 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
>              lp_build_endif(&if_ctx);
>              true_index = LLVMBuildLoad(builder, index_ptr, "true_index");
>           }
> +         true_indices[i] = true_index;
>           true_index_array = LLVMBuildInsertElement(
>              gallivm->builder, true_index_array, true_index,
>              lp_build_const_int32(gallivm, i), "");
> +      }
> +
> +      for (j = 0; j < key->nr_vertex_elements; ++j) {
> +         struct pipe_vertex_element *velem = &draw->pt.vertex_element[j];
> +         const struct util_format_description *format_desc =
> +            util_format_description(velem->src_format);
>
> -         for (j = 0; j < key->nr_vertex_elements; ++j) {
> -            struct pipe_vertex_element *velem = &draw->pt.vertex_element[j];
> -            LLVMValueRef vb_index =
> -               lp_build_const_int32(gallivm, velem->vertex_buffer_index);
> -            LLVMValueRef vb = LLVMBuildGEP(builder, vb_ptr, &vb_index, 1, "");
> -            generate_fetch(gallivm, draw, vbuffers_ptr,
> -                           &aos_attribs[j][i], velem, vb, true_index,
> -                           system_values.instance_id, start_instance);
> +         for (i = 0; i < vector_length; ++i) {
> +            generate_fetch(gallivm, draw, format_desc,
> +                           vb_stride[j], stride_fixed[j], map_ptr[j],
> +                           buffer_size_adj[j], ofbit[j], &aos_attribs[i],
> +                           velem->instance_divisor ? instance_index[j] : true_indices[i]);
>           }
> +         convert_to_soa(gallivm, aos_attribs, inputs, j, vs_type);
>        }
> -      convert_to_soa(gallivm, aos_attribs, inputs,
> -                     key->nr_vertex_elements, vs_type);
>
>        /* In the paths with elts vertex id has to be unaffected by the
>         * index bias and because indices inside our elements array have
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit_overflow.c b/src/gallium/auxiliary/gallivm/lp_bld_arit_overflow.c
> index 91247fd..152ad57 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_arit_overflow.c
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit_overflow.c
> @@ -127,6 +127,30 @@ lp_build_uadd_overflow(struct gallivm_state *gallivm,
>  }
>
>  /**
> + * Performs unsigned subtraction of two integers and reports
> + * overflow if detected.
> + *
> + * The values @a and @b must be of the same integer type. If
> + * an overflow is detected the IN/OUT @ofbit parameter is used:
> + * - if it's pointing to a null value, the overflow bit is simply
> + *   stored inside the variable it's pointing to,
> + * - if it's pointing to a valid value, then that variable,
> + *   which must be of i1 type, is ORed with the newly detected
> + *   overflow bit. This is done to allow chaining of a number of
> + *   overflow functions together without having to test the
> + *   overflow bit after every single one.
> + */
> +LLVMValueRef
> +lp_build_usub_overflow(struct gallivm_state *gallivm,
> +                       LLVMValueRef a,
> +                       LLVMValueRef b,
> +                       LLVMValueRef *ofbit)
> +{
> +   return build_binary_int_overflow(gallivm, "llvm.usub.with.overflow",
> +                                    a, b, ofbit);
> +}
> +
> +/**
>   * Performs unsigned multiplication of  two integers and
>   * reports overflow if detected.
>   *
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit_overflow.h b/src/gallium/auxiliary/gallivm/lp_bld_arit_overflow.h
> index 8c35a04..34ce00e 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_arit_overflow.h
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit_overflow.h
> @@ -49,6 +49,12 @@ lp_build_uadd_overflow(struct gallivm_state *gallivm,
>                         LLVMValueRef *ofbit);
>
>  LLVMValueRef
> +lp_build_usub_overflow(struct gallivm_state *gallivm,
> +                       LLVMValueRef a,
> +                       LLVMValueRef b,
> +                       LLVMValueRef *ofbit);
> +
> +LLVMValueRef
>  lp_build_umul_overflow(struct gallivm_state *gallivm,
>                         LLVMValueRef a,
>                         LLVMValueRef b,
>

Looks good to me AFAICT.

Reviewed-by: Jose Fonseca <jfonseca at vmware.com>


More information about the mesa-dev mailing list