[Mesa-dev] [PATCH 1/2] gallivm: don't use control flow when doing indirect constant buffer lookups
Jose Fonseca
jfonseca at vmware.com
Wed Apr 8 12:13:42 PDT 2015
Series looks good to me.
Just a few suggestions inline.
On 04/04/15 15:50, sroland at vmware.com wrote:
> From: Roland Scheidegger <sroland at vmware.com>
>
> llvm goes crazy when doing that, using way more memory and time, though there's
> probably more to it - this points to a very much similar issue as fixed in
> 8a9f5ecdb116d0449d63f7b94efbfa8b205d826f. In any case I've seen a quite
> plain looking vertex shader with just ~50 simple tgsi instructions (but with a
> dozen or so such indirect constant buffer lookups) go from a terribly high
> ~440ms compile time (consuming 25MB of memory in the process) down to a still
> awful ~230ms and 13MB with this fix (with llvm 3.3), so there's still obvious
> improvements possible (but I have no clue why it's so slow...).
> The resulting shader is most likely also faster (certainly seemed so though
> I don't have any hard numbers as it may have been influenced by compile times)
> since generally fetching constants outside the buffer range is most likely an
> app error (that is we expect all indices to be valid).
> It is possible this fixes some mysterious vertex shader slowdowns we've seen
> ever since we are conforming to newer apis at least partially (the main draw
> loop also has similar looking conditionals which we probably could do without -
> if not for the fetch at least for the additional elts condition.)
> ---
> src/gallium/auxiliary/draw/draw_llvm.h | 2 +
> .../draw/draw_pt_fetch_shade_pipeline_llvm.c | 27 +++---
> src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c | 95 +++++++++-------------
> src/gallium/drivers/llvmpipe/lp_scene.h | 2 +
> src/gallium/drivers/llvmpipe/lp_setup.c | 6 +-
> 5 files changed, 63 insertions(+), 69 deletions(-)
>
> diff --git a/src/gallium/auxiliary/draw/draw_llvm.h b/src/gallium/auxiliary/draw/draw_llvm.h
> index 9565fc6..a1983e1 100644
> --- a/src/gallium/auxiliary/draw/draw_llvm.h
> +++ b/src/gallium/auxiliary/draw/draw_llvm.h
> @@ -472,6 +472,8 @@ struct draw_llvm {
>
> struct draw_gs_llvm_variant_list_item gs_variants_list;
> int nr_gs_variants;
> +
> + float fake_const_buf[4];
Couldn't we make fake_const_buf a mere local static const array instead?
It would save memory.
> };
>
>
> diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
> index 0dfafdc..03257d8 100644
> --- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
> +++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
> @@ -273,28 +273,35 @@ llvm_middle_end_bind_parameters(struct draw_pt_middle_end *middle)
> {
> struct llvm_middle_end *fpme = llvm_middle_end(middle);
> struct draw_context *draw = fpme->draw;
> + struct draw_llvm *llvm = fpme->llvm;
> unsigned i;
>
> - for (i = 0; i < Elements(fpme->llvm->jit_context.vs_constants); ++i) {
> + for (i = 0; i < Elements(llvm->jit_context.vs_constants); ++i) {
> int num_consts =
> draw->pt.user.vs_constants_size[i] / (sizeof(float) * 4);
> - fpme->llvm->jit_context.vs_constants[i] = draw->pt.user.vs_constants[i];
> - fpme->llvm->jit_context.num_vs_constants[i] = num_consts;
> + llvm->jit_context.vs_constants[i] = draw->pt.user.vs_constants[i];
> + llvm->jit_context.num_vs_constants[i] = num_consts;
> + if (num_consts == 0) {
> + llvm->jit_context.vs_constants[i] = llvm->fake_const_buf;
> + }
> }
> - for (i = 0; i < Elements(fpme->llvm->gs_jit_context.constants); ++i) {
> + for (i = 0; i < Elements(llvm->gs_jit_context.constants); ++i) {
> int num_consts =
> draw->pt.user.gs_constants_size[i] / (sizeof(float) * 4);
> - fpme->llvm->gs_jit_context.constants[i] = draw->pt.user.gs_constants[i];
> - fpme->llvm->gs_jit_context.num_constants[i] = num_consts;
> + llvm->gs_jit_context.constants[i] = draw->pt.user.gs_constants[i];
> + llvm->gs_jit_context.num_constants[i] = num_consts;
> + if (num_consts == 0) {
> + llvm->gs_jit_context.constants[i] = llvm->fake_const_buf;
> + }
> }
>
> - fpme->llvm->jit_context.planes =
> + llvm->jit_context.planes =
> (float (*)[DRAW_TOTAL_CLIP_PLANES][4]) draw->pt.user.planes[0];
> - fpme->llvm->gs_jit_context.planes =
> + llvm->gs_jit_context.planes =
> (float (*)[DRAW_TOTAL_CLIP_PLANES][4]) draw->pt.user.planes[0];
>
> - fpme->llvm->jit_context.viewports = draw->viewports;
> - fpme->llvm->gs_jit_context.viewports = draw->viewports;
> + llvm->jit_context.viewports = draw->viewports;
> + llvm->gs_jit_context.viewports = draw->viewports;
> }
>
>
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
> index 17b68ff..5aa2846 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
> @@ -944,20 +944,39 @@ gather_outputs(struct lp_build_tgsi_soa_context * bld)
> * with a little work.
> */
> static LLVMValueRef
> -build_gather(struct lp_build_context *bld,
> +build_gather(struct lp_build_tgsi_context *bld_base,
> LLVMValueRef base_ptr,
> LLVMValueRef indexes,
> - LLVMValueRef *overflow_mask)
> + LLVMValueRef overflow_mask)
> {
> - LLVMBuilderRef builder = bld->gallivm->builder;
> + struct gallivm_state *gallivm = bld_base->base.gallivm;
> + LLVMBuilderRef builder = gallivm->builder;
> + struct lp_build_context *uint_bld = &bld_base->uint_bld;
> + struct lp_build_context *bld = &bld_base->base;
> LLVMValueRef res = bld->undef;
> unsigned i;
> - LLVMValueRef temp_ptr = NULL;
> +
> + /*
> + * overflow_mask is a vector telling us which channels
> + * in the vector overflowed. We use the overflow behavior for
> + * constant buffers which is defined as:
> + * Out of bounds access to constant buffer returns 0 in all
> + * components. Out of bounds behavior is always with respect
> + * to the size of the buffer bound at that slot.
> + */
>
> if (overflow_mask) {
> - temp_ptr = lp_build_alloca(
> - bld->gallivm,
> - lp_build_vec_type(bld->gallivm, bld->type), "");
> + /*
> + * We avoid per-element control flow here (also due to llvm going crazy,
> + * though I suspect it's better anyway since overflow is likely rare).
> + * Note that since we still fetch from buffers even if num_elements was
> + * zero (in this case we'll fetch from index zero) the jit func callers
> + * MUST provide valid fake constant buffers of size 4x32 (the values do
> + * not matter), otherwise we'd still need (not per element though)
> + * control flow.
> + */
> + LLVMValueRef zeros = lp_build_const_int_vec(gallivm, uint_bld->type, 0);
Instead of recreating zeros, you can just use uint_bld->zero , here and
below.
> + indexes = lp_build_select(uint_bld, overflow_mask, zeros, indexes);
> }
>
> /*
> @@ -968,53 +987,17 @@ build_gather(struct lp_build_context *bld,
> LLVMValueRef index = LLVMBuildExtractElement(builder,
> indexes, ii, "");
> LLVMValueRef scalar_ptr, scalar;
> - LLVMValueRef overflow;
> - struct lp_build_if_state if_ctx;
> -
> - /*
> - * overflow_mask is a boolean vector telling us which channels
> - * in the vector overflowed. We use the overflow behavior for
> - * constant buffers which is defined as:
> - * Out of bounds access to constant buffer returns 0 in all
> - * componenets. Out of bounds behavior is always with respect
> - * to the size of the buffer bound at that slot.
> - */
> - if (overflow_mask) {
> - overflow = LLVMBuildExtractElement(builder, *overflow_mask,
> - ii, "");
> - lp_build_if(&if_ctx, bld->gallivm, overflow);
> - {
> - LLVMValueRef val = LLVMBuildLoad(builder, temp_ptr, "");
> - val = LLVMBuildInsertElement(
> - builder, val,
> - LLVMConstNull(LLVMFloatTypeInContext(bld->gallivm->context)),
> - ii, "");
> - LLVMBuildStore(builder, val, temp_ptr);
> - }
> - lp_build_else(&if_ctx);
> - {
> - LLVMValueRef val = LLVMBuildLoad(builder, temp_ptr, "");
> -
> - scalar_ptr = LLVMBuildGEP(builder, base_ptr,
> - &index, 1, "gather_ptr");
> - scalar = LLVMBuildLoad(builder, scalar_ptr, "");
> -
> - val = LLVMBuildInsertElement(builder, val, scalar, ii, "");
>
> - LLVMBuildStore(builder, val, temp_ptr);
> - }
> - lp_build_endif(&if_ctx);
> - } else {
> - scalar_ptr = LLVMBuildGEP(builder, base_ptr,
> - &index, 1, "gather_ptr");
> - scalar = LLVMBuildLoad(builder, scalar_ptr, "");
> + scalar_ptr = LLVMBuildGEP(builder, base_ptr,
> + &index, 1, "gather_ptr");
> + scalar = LLVMBuildLoad(builder, scalar_ptr, "");
>
> - res = LLVMBuildInsertElement(builder, res, scalar, ii, "");
> - }
> + res = LLVMBuildInsertElement(builder, res, scalar, ii, "");
> }
>
> if (overflow_mask) {
> - res = LLVMBuildLoad(builder, temp_ptr, "gather_val");
> + LLVMValueRef zeros = lp_build_const_vec(gallivm, bld->type, 0.0);
bld->zero
> + res = lp_build_select(bld, overflow_mask, zeros, res);
> }
>
> return res;
> @@ -1247,17 +1230,15 @@ emit_fetch_constant(
> num_consts = lp_build_broadcast_scalar(uint_bld, num_consts);
> /* Construct a boolean vector telling us which channels
> * overflow the bound constant buffer */
> - overflow_mask = LLVMBuildICmp(builder, LLVMIntUGE,
> - indirect_index,
> - num_consts, "");
> + overflow_mask = lp_build_compare(gallivm, uint_bld->type, PIPE_FUNC_GEQUAL,
> + indirect_index, num_consts);
>
> /* index_vec = indirect_index * 4 + swizzle */
> index_vec = lp_build_shl_imm(uint_bld, indirect_index, 2);
> index_vec = lp_build_add(uint_bld, index_vec, swizzle_vec);
>
> /* Gather values from the constant buffer */
> - res = build_gather(&bld_base->base, consts_ptr, index_vec,
> - &overflow_mask);
> + res = build_gather(bld_base, consts_ptr, index_vec, overflow_mask);
> }
> else {
> LLVMValueRef index; /* index into the const buffer */
> @@ -1319,7 +1300,7 @@ emit_fetch_immediate(
> FALSE);
>
> /* Gather values from the immediate register array */
> - res = build_gather(&bld_base->base, imms_array, index_vec, NULL);
> + res = build_gather(bld_base, imms_array, index_vec, NULL);
> } else {
> LLVMValueRef lindex = lp_build_const_int32(gallivm,
> reg->Register.Index * 4 + swizzle);
> @@ -1373,7 +1354,7 @@ emit_fetch_input(
> inputs_array = LLVMBuildBitCast(builder, bld->inputs_array, fptr_type, "");
>
> /* Gather values from the input register array */
> - res = build_gather(&bld_base->base, inputs_array, index_vec, NULL);
> + res = build_gather(bld_base, inputs_array, index_vec, NULL);
> } else {
> if (bld->indirect_files & (1 << TGSI_FILE_INPUT)) {
> LLVMValueRef lindex = lp_build_const_int32(gallivm,
> @@ -1495,7 +1476,7 @@ emit_fetch_temporary(
> temps_array = LLVMBuildBitCast(builder, bld->temps_array, fptr_type, "");
>
> /* Gather values from the temporary register array */
> - res = build_gather(&bld_base->base, temps_array, index_vec, NULL);
> + res = build_gather(bld_base, temps_array, index_vec, NULL);
> }
> else {
> LLVMValueRef temp_ptr;
> diff --git a/src/gallium/drivers/llvmpipe/lp_scene.h b/src/gallium/drivers/llvmpipe/lp_scene.h
> index ad23c20..3ef0f32 100644
> --- a/src/gallium/drivers/llvmpipe/lp_scene.h
> +++ b/src/gallium/drivers/llvmpipe/lp_scene.h
> @@ -178,6 +178,8 @@ struct lp_scene {
>
> struct cmd_bin tile[TILES_X][TILES_Y];
> struct data_block_list data;
> +
> + float fake_const_buf[4];
> };
>
>
> diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c
> index 3b0056c..bfe0af4 100644
> --- a/src/gallium/drivers/llvmpipe/lp_setup.c
> +++ b/src/gallium/drivers/llvmpipe/lp_setup.c
> @@ -1103,14 +1103,16 @@ try_update_scene_state( struct lp_setup_context *setup )
> setup->constants[i].stored_size = current_size;
> setup->constants[i].stored_data = stored;
> }
> + setup->fs.current.jit_context.constants[i] =
> + setup->constants[i].stored_data;
> }
> else {
> setup->constants[i].stored_size = 0;
> setup->constants[i].stored_data = NULL;
> + setup->fs.current.jit_context.constants[i] =
> + scene->fake_const_buf;
> }
>
> - setup->fs.current.jit_context.constants[i] =
> - setup->constants[i].stored_data;
> num_constants =
> setup->constants[i].stored_size / (sizeof(float) * 4);
> setup->fs.current.jit_context.num_constants[i] = num_constants;
>
Jose
More information about the mesa-dev
mailing list