[Mesa-dev] [PATCH] draw: improve vertex fetch (v2)
Jose Fonseca
jfonseca at vmware.com
Tue Oct 18 22:13:09 UTC 2016
On 15/10/16 02:54, sroland at vmware.com wrote:
> From: Roland Scheidegger <sroland at vmware.com>
>
> The per-element fetch has quite some calculations which are constant,
> these can be moved outside both the per-element as well as the main
> shader loop (llvm can figure out it's constant mostly on its own, however
> this can have a significant compile time cost).
> Similarly, it looks easier to swap the fetch loops (outer loop per attrib,
> inner loop filling up the per-vertex elements) - this way the aos->soa
> conversion can also be done per attrib and not just at the end, though again
> this doesn't really make much of a difference in the generated code. (This
> would also make it possible to vectorize the calculations leading to the
> fetches.)
> There's also some minimal change simplifying the overflow math slightly.
> All in all, the generated code seems to look slightly simpler (depending
> on the actual vs), but more importantly I've seen a significant reduction
> in compile times for some vs (albeit with old (3.3) llvm version, and the
> time reduction is only really for the optimizations run on the IR).
> v2: adapt to other draw change.
>
> No changes with piglit.
> ---
> src/gallium/auxiliary/draw/draw_llvm.c | 190 +++++++++++----------
> .../auxiliary/gallivm/lp_bld_arit_overflow.c | 24 +++
> .../auxiliary/gallivm/lp_bld_arit_overflow.h | 6 +
> 3 files changed, 134 insertions(+), 86 deletions(-)
>
> diff --git a/src/gallium/auxiliary/draw/draw_llvm.c b/src/gallium/auxiliary/draw/draw_llvm.c
> index 3b56856..2f82d9d 100644
> --- a/src/gallium/auxiliary/draw/draw_llvm.c
> +++ b/src/gallium/auxiliary/draw/draw_llvm.c
> @@ -659,85 +659,42 @@ generate_vs(struct draw_llvm_variant *variant,
> static void
> generate_fetch(struct gallivm_state *gallivm,
> struct draw_context *draw,
> - LLVMValueRef vbuffers_ptr,
> + const struct util_format_description *format_desc,
> + LLVMValueRef vb_stride,
> + LLVMValueRef stride_fixed,
> + LLVMValueRef map_ptr,
> + LLVMValueRef buffer_size_adj,
> + LLVMValueRef ofbit,
> LLVMValueRef *res,
> - struct pipe_vertex_element *velem,
> - LLVMValueRef vbuf,
> - LLVMValueRef index,
> - LLVMValueRef instance_id,
> - LLVMValueRef start_instance)
> + LLVMValueRef index)
> {
> - const struct util_format_description *format_desc =
> - util_format_description(velem->src_format);
> LLVMValueRef zero = LLVMConstNull(LLVMInt32TypeInContext(gallivm->context));
> LLVMBuilderRef builder = gallivm->builder;
> - LLVMValueRef indices =
> - LLVMConstInt(LLVMInt64TypeInContext(gallivm->context),
> - velem->vertex_buffer_index, 0);
> - LLVMValueRef vbuffer_ptr = LLVMBuildGEP(builder, vbuffers_ptr,
> - &indices, 1, "");
> - LLVMValueRef vb_stride = draw_jit_vbuffer_stride(gallivm, vbuf);
> - LLVMValueRef vb_buffer_offset = draw_jit_vbuffer_offset(gallivm, vbuf);
> - LLVMValueRef map_ptr = draw_jit_dvbuffer_map(gallivm, vbuffer_ptr);
> - LLVMValueRef buffer_size = draw_jit_dvbuffer_size(gallivm, vbuffer_ptr);
> LLVMValueRef stride;
> LLVMValueRef buffer_overflowed;
> - LLVMValueRef needed_buffer_size;
> LLVMValueRef temp_ptr =
> lp_build_alloca(gallivm,
> lp_build_vec_type(gallivm, lp_float32_vec4_type()), "");
> - LLVMValueRef ofbit = NULL;
> struct lp_build_if_state if_ctx;
>
> - if (velem->src_format == PIPE_FORMAT_NONE) {
> + if (format_desc->format == PIPE_FORMAT_NONE) {
> *res = lp_build_const_vec(gallivm, lp_float32_vec4_type(), 0);
> return;
> }
>
> - if (velem->instance_divisor) {
> - /* Index is equal to the start instance plus the number of current
> - * instance divided by the divisor. In this case we compute it as:
> - * index = start_instance + (instance_id / divisor)
> - */
> - LLVMValueRef current_instance;
> - current_instance = LLVMBuildUDiv(builder, instance_id,
> - lp_build_const_int32(gallivm, velem->instance_divisor),
> - "instance_divisor");
> - index = lp_build_uadd_overflow(gallivm, start_instance,
> - current_instance, &ofbit);
> - }
> -
> stride = lp_build_umul_overflow(gallivm, vb_stride, index, &ofbit);
> - stride = lp_build_uadd_overflow(gallivm, stride, vb_buffer_offset, &ofbit);
> - stride = lp_build_uadd_overflow(
> - gallivm, stride,
> - lp_build_const_int32(gallivm, velem->src_offset), &ofbit);
> - needed_buffer_size = lp_build_uadd_overflow(
> - gallivm, stride,
> - lp_build_const_int32(gallivm,
> - util_format_get_blocksize(velem->src_format)),
> - &ofbit);
> + stride = lp_build_uadd_overflow(gallivm, stride, stride_fixed, &ofbit);
>
> buffer_overflowed = LLVMBuildICmp(builder, LLVMIntUGT,
> - needed_buffer_size, buffer_size,
> + stride, buffer_size_adj,
> "buffer_overflowed");
> buffer_overflowed = LLVMBuildOr(builder, buffer_overflowed, ofbit, "");
> -#if 0
> - lp_build_printf(gallivm, "vbuf index = %u, vb_stride is %u\n",
> - index, vb_stride);
> - lp_build_printf(gallivm, " vb_buffer_offset = %u, src_offset is %u\n",
> - vb_buffer_offset,
> - lp_build_const_int32(gallivm, velem->src_offset));
> - lp_build_print_value(gallivm, " blocksize = ",
> - lp_build_const_int32(
> - gallivm,
> - util_format_get_blocksize(velem->src_format)));
> - lp_build_printf(gallivm, " instance_id = %u\n", instance_id);
> - lp_build_printf(gallivm, " stride = %u\n", stride);
> - lp_build_printf(gallivm, " buffer size = %u\n", buffer_size);
> - lp_build_printf(gallivm, " needed_buffer_size = %u\n", needed_buffer_size);
> - lp_build_print_value(gallivm, " buffer overflowed = ", buffer_overflowed);
> -#endif
> +
> + if (0) {
> + lp_build_printf(gallivm, " stride = %u\n", stride);
> + lp_build_printf(gallivm, " buffer size adj = %u\n", buffer_size_adj);
> + lp_build_print_value(gallivm, " buffer overflowed = ", buffer_overflowed);
> + }
>
> lp_build_if(&if_ctx, gallivm, buffer_overflowed);
> {
> @@ -766,36 +723,34 @@ generate_fetch(struct gallivm_state *gallivm,
>
> static void
> convert_to_soa(struct gallivm_state *gallivm,
> - LLVMValueRef (*src_aos)[LP_MAX_VECTOR_WIDTH / 32],
> + LLVMValueRef src_aos[LP_MAX_VECTOR_WIDTH / 32],
> LLVMValueRef (*dst_soa)[TGSI_NUM_CHANNELS],
> - unsigned num_attribs, const struct lp_type soa_type)
> + unsigned attrib, const struct lp_type soa_type)
> {
> - unsigned i, j, k;
> + unsigned j, k;
> struct lp_type aos_channel_type = soa_type;
>
> + LLVMValueRef aos_channels[TGSI_NUM_CHANNELS];
> + unsigned pixels_per_channel = soa_type.length / TGSI_NUM_CHANNELS;
> +
> debug_assert(TGSI_NUM_CHANNELS == 4);
> debug_assert((soa_type.length % TGSI_NUM_CHANNELS) == 0);
>
> aos_channel_type.length >>= 1;
>
> - for (i = 0; i < num_attribs; ++i) {
> - LLVMValueRef aos_channels[TGSI_NUM_CHANNELS];
> - unsigned pixels_per_channel = soa_type.length / TGSI_NUM_CHANNELS;
> -
> - for (j = 0; j < TGSI_NUM_CHANNELS; ++j) {
> - LLVMValueRef channel[LP_MAX_VECTOR_LENGTH] = { 0 };
> + for (j = 0; j < TGSI_NUM_CHANNELS; ++j) {
> + LLVMValueRef channel[LP_MAX_VECTOR_LENGTH] = { 0 };
>
> - assert(pixels_per_channel <= LP_MAX_VECTOR_LENGTH);
> -
> - for (k = 0; k < pixels_per_channel; ++k) {
> - channel[k] = src_aos[i][j + TGSI_NUM_CHANNELS * k];
> - }
> + assert(pixels_per_channel <= LP_MAX_VECTOR_LENGTH);
>
> - aos_channels[j] = lp_build_concat(gallivm, channel, aos_channel_type, pixels_per_channel);
> + for (k = 0; k < pixels_per_channel; ++k) {
> + channel[k] = src_aos[j + TGSI_NUM_CHANNELS * k];
> }
>
> - lp_build_transpose_aos(gallivm, soa_type, aos_channels, dst_soa[i]);
> + aos_channels[j] = lp_build_concat(gallivm, channel, aos_channel_type, pixels_per_channel);
> }
> +
> + lp_build_transpose_aos(gallivm, soa_type, aos_channels, dst_soa[attrib]);
> }
>
>
> @@ -1549,6 +1504,13 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
> LLVMValueRef io_ptr, vbuffers_ptr, vb_ptr;
> LLVMValueRef zero = lp_build_const_int32(gallivm, 0);
> LLVMValueRef one = lp_build_const_int32(gallivm, 1);
> + LLVMValueRef vb_stride[PIPE_MAX_SHADER_INPUTS];
> + LLVMValueRef map_ptr[PIPE_MAX_SHADER_INPUTS];
> + LLVMValueRef buffer_size_adj[PIPE_MAX_SHADER_INPUTS];
> + LLVMValueRef stride_fixed[PIPE_MAX_SHADER_INPUTS];
> + LLVMValueRef ofbit[PIPE_MAX_SHADER_INPUTS];
> + LLVMValueRef instance_index[PIPE_MAX_SHADER_INPUTS];
> +
> struct draw_context *draw = llvm->draw;
> const struct tgsi_shader_info *vs_info = &draw->vs.vertex_shader->info;
> unsigned i, j;
> @@ -1693,14 +1655,67 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
>
> fetch_max = LLVMBuildSub(builder, end, one, "fetch_max");
>
> + /*
> + * Pre-calculate everything which is constant per shader invocation.
> + */
> + for (j = 0; j < draw->pt.nr_vertex_elements; ++j) {
> + LLVMValueRef vb_buffer_offset, buffer_size;
> + LLVMValueRef vb_info, vbuffer_ptr;
> + struct pipe_vertex_element *velem = &draw->pt.vertex_element[j];
> + LLVMValueRef vb_index =
> + lp_build_const_int32(gallivm, velem->vertex_buffer_index);
> + LLVMValueRef bsize = lp_build_const_int32(gallivm,
> + util_format_get_blocksize(velem->src_format));
> + LLVMValueRef src_offset = lp_build_const_int32(gallivm,
> + velem->src_offset);
> +
> + vbuffer_ptr = LLVMBuildGEP(builder, vbuffers_ptr, &vb_index, 1, "");
> + vb_info = LLVMBuildGEP(builder, vb_ptr, &vb_index, 1, "");
> + vb_stride[j] = draw_jit_vbuffer_stride(gallivm, vb_info);
> + vb_buffer_offset = draw_jit_vbuffer_offset(gallivm, vb_info);
> + map_ptr[j] = draw_jit_dvbuffer_map(gallivm, vbuffer_ptr);
> + buffer_size = draw_jit_dvbuffer_size(gallivm, vbuffer_ptr);
> +
> + ofbit[j] = NULL;
> + stride_fixed[j] = lp_build_uadd_overflow(gallivm, vb_buffer_offset,
> + src_offset, &ofbit[j]);
> + buffer_size_adj[j] = lp_build_usub_overflow(gallivm, buffer_size, bsize,
> + &ofbit[j]);
> +
> + if (velem->instance_divisor) {
> + /* Index is equal to the start instance plus the number of current
> + * instance divided by the divisor. In this case we compute it as:
> + * index = start_instance + (instance_id / divisor)
> + */
> + LLVMValueRef current_instance;
> + current_instance = LLVMBuildUDiv(builder, system_values.instance_id,
> + lp_build_const_int32(gallivm,
> + velem->instance_divisor),
> + "instance_divisor");
> + instance_index[j] = lp_build_uadd_overflow(gallivm, start_instance,
> + current_instance, &ofbit[j]);
> + }
> +
> + if (0) {
> + lp_build_printf(gallivm, "vbuf index = %u, vb_stride is %u\n",
> + vb_index, vb_stride[j]);
> + lp_build_printf(gallivm, " vb_buffer_offset = %u, src_offset is %u\n",
> + vb_buffer_offset, src_offset);
> + lp_build_print_value(gallivm, " blocksize = ", bsize);
> + lp_build_printf(gallivm, " instance_id = %u\n", system_values.instance_id);
> + lp_build_printf(gallivm, " buffer size = %u\n", buffer_size);
> + }
> + }
> +
> lp_build_loop_begin(&lp_loop, gallivm, zero);
> {
> LLVMValueRef inputs[PIPE_MAX_SHADER_INPUTS][TGSI_NUM_CHANNELS];
> - LLVMValueRef aos_attribs[PIPE_MAX_SHADER_INPUTS][LP_MAX_VECTOR_WIDTH / 32] = { { 0 } };
> + LLVMValueRef aos_attribs[LP_MAX_VECTOR_WIDTH / 32] = { 0 };
> LLVMValueRef io;
> LLVMValueRef clipmask; /* holds the clipmask value */
> LLVMValueRef true_index_array = lp_build_zero(gallivm,
> lp_type_uint_vec(32, 32*vector_length));
> + LLVMValueRef true_indices[LP_MAX_VECTOR_WIDTH / 32];
> const LLVMValueRef (*ptr_aos)[TGSI_NUM_CHANNELS];
>
> io_itr = lp_loop.counter;
> @@ -1760,22 +1775,25 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
> lp_build_endif(&if_ctx);
> true_index = LLVMBuildLoad(builder, index_ptr, "true_index");
> }
> + true_indices[i] = true_index;
> true_index_array = LLVMBuildInsertElement(
> gallivm->builder, true_index_array, true_index,
> lp_build_const_int32(gallivm, i), "");
> + }
> +
> + for (j = 0; j < key->nr_vertex_elements; ++j) {
> + struct pipe_vertex_element *velem = &draw->pt.vertex_element[j];
> + const struct util_format_description *format_desc =
> + util_format_description(velem->src_format);
>
> - for (j = 0; j < key->nr_vertex_elements; ++j) {
> - struct pipe_vertex_element *velem = &draw->pt.vertex_element[j];
> - LLVMValueRef vb_index =
> - lp_build_const_int32(gallivm, velem->vertex_buffer_index);
> - LLVMValueRef vb = LLVMBuildGEP(builder, vb_ptr, &vb_index, 1, "");
> - generate_fetch(gallivm, draw, vbuffers_ptr,
> - &aos_attribs[j][i], velem, vb, true_index,
> - system_values.instance_id, start_instance);
> + for (i = 0; i < vector_length; ++i) {
> + generate_fetch(gallivm, draw, format_desc,
> + vb_stride[j], stride_fixed[j], map_ptr[j],
> + buffer_size_adj[j], ofbit[j], &aos_attribs[i],
> + velem->instance_divisor ? instance_index[j] : true_indices[i]);
> }
> + convert_to_soa(gallivm, aos_attribs, inputs, j, vs_type);
> }
> - convert_to_soa(gallivm, aos_attribs, inputs,
> - key->nr_vertex_elements, vs_type);
>
> /* In the paths with elts vertex id has to be unaffected by the
> * index bias and because indices inside our elements array have
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit_overflow.c b/src/gallium/auxiliary/gallivm/lp_bld_arit_overflow.c
> index 91247fd..152ad57 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_arit_overflow.c
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit_overflow.c
> @@ -127,6 +127,30 @@ lp_build_uadd_overflow(struct gallivm_state *gallivm,
> }
>
> /**
> + * Performs unsigned subtraction of two integers and reports
> + * overflow if detected.
> + *
> + * The values @a and @b must be of the same integer type. If
> + * an overflow is detected the IN/OUT @ofbit parameter is used:
> + * - if it's pointing to a null value, the overflow bit is simply
> + * stored inside the variable it's pointing to,
> + * - if it's pointing to a valid value, then that variable,
> + * which must be of i1 type, is ORed with the newly detected
> + * overflow bit. This is done to allow chaining of a number of
> + * overflow functions together without having to test the
> + * overflow bit after every single one.
> + */
> +LLVMValueRef
> +lp_build_usub_overflow(struct gallivm_state *gallivm,
> + LLVMValueRef a,
> + LLVMValueRef b,
> + LLVMValueRef *ofbit)
> +{
> + return build_binary_int_overflow(gallivm, "llvm.usub.with.overflow",
> + a, b, ofbit);
> +}
> +
> +/**
> * Performs unsigned multiplication of two integers and
> * reports overflow if detected.
> *
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit_overflow.h b/src/gallium/auxiliary/gallivm/lp_bld_arit_overflow.h
> index 8c35a04..34ce00e 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_arit_overflow.h
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit_overflow.h
> @@ -49,6 +49,12 @@ lp_build_uadd_overflow(struct gallivm_state *gallivm,
> LLVMValueRef *ofbit);
>
> LLVMValueRef
> +lp_build_usub_overflow(struct gallivm_state *gallivm,
> + LLVMValueRef a,
> + LLVMValueRef b,
> + LLVMValueRef *ofbit);
> +
> +LLVMValueRef
> lp_build_umul_overflow(struct gallivm_state *gallivm,
> LLVMValueRef a,
> LLVMValueRef b,
>
Looks good to me AFAICT.
Reviewed-by: Jose Fonseca <jfonseca at vmware.com>
More information about the mesa-dev
mailing list