[Mesa-dev] [PATCH 1/2] gallivm: don't use control flow when doing indirect constant buffer lookups

Jose Fonseca jfonseca at vmware.com
Wed Apr 8 12:13:42 PDT 2015


Series looks good to me.

Just a few suggestions inline.


On 04/04/15 15:50, sroland at vmware.com wrote:
> From: Roland Scheidegger <sroland at vmware.com>
>
> llvm goes crazy when doing that, using way more memory and time, though there's
> probably more to it - this points to a very much similar issue as fixed in
> 8a9f5ecdb116d0449d63f7b94efbfa8b205d826f. In any case I've seen a quite
> plain looking vertex shader with just ~50 simple tgsi instructions (but with a
> dozen or so such indirect constant buffer lookups) go from a terribly high
> ~440ms compile time (consuming 25MB of memory in the process) down to a still
> awful ~230ms and 13MB with this fix (with llvm 3.3), so there's still obvious
> improvements possible (but I have no clue why it's so slow...).
> The resulting shader is most likely also faster (certainly seemed so though
> I don't have any hard numbers as it may have been influenced by compile times)
> since generally fetching constants outside the buffer range is most likely an
> app error (that is we expect all indices to be valid).
> It is possible this fixes some mysterious vertex shader slowdowns we've seen
> ever since we are conforming to newer apis at least partially (the main draw
> loop also has similar looking conditionals which we probably could do without -
> if not for the fetch at least for the additional elts condition.)
> ---
>   src/gallium/auxiliary/draw/draw_llvm.h             |  2 +
>   .../draw/draw_pt_fetch_shade_pipeline_llvm.c       | 27 +++---
>   src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c    | 95 +++++++++-------------
>   src/gallium/drivers/llvmpipe/lp_scene.h            |  2 +
>   src/gallium/drivers/llvmpipe/lp_setup.c            |  6 +-
>   5 files changed, 63 insertions(+), 69 deletions(-)
>
> diff --git a/src/gallium/auxiliary/draw/draw_llvm.h b/src/gallium/auxiliary/draw/draw_llvm.h
> index 9565fc6..a1983e1 100644
> --- a/src/gallium/auxiliary/draw/draw_llvm.h
> +++ b/src/gallium/auxiliary/draw/draw_llvm.h
> @@ -472,6 +472,8 @@ struct draw_llvm {
>
>      struct draw_gs_llvm_variant_list_item gs_variants_list;
>      int nr_gs_variants;
> +
> +   float fake_const_buf[4];

Couldn't we make fake_const_buf a mere local static const array instead? 
It would save memory.

>   };
>
>
> diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
> index 0dfafdc..03257d8 100644
> --- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
> +++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
> @@ -273,28 +273,35 @@ llvm_middle_end_bind_parameters(struct draw_pt_middle_end *middle)
>   {
>      struct llvm_middle_end *fpme = llvm_middle_end(middle);
>      struct draw_context *draw = fpme->draw;
> +   struct draw_llvm *llvm = fpme->llvm;
>      unsigned i;
>
> -   for (i = 0; i < Elements(fpme->llvm->jit_context.vs_constants); ++i) {
> +   for (i = 0; i < Elements(llvm->jit_context.vs_constants); ++i) {
>         int num_consts =
>            draw->pt.user.vs_constants_size[i] / (sizeof(float) * 4);
> -      fpme->llvm->jit_context.vs_constants[i] = draw->pt.user.vs_constants[i];
> -      fpme->llvm->jit_context.num_vs_constants[i] = num_consts;
> +      llvm->jit_context.vs_constants[i] = draw->pt.user.vs_constants[i];
> +      llvm->jit_context.num_vs_constants[i] = num_consts;
> +      if (num_consts == 0) {
> +         llvm->jit_context.vs_constants[i] = llvm->fake_const_buf;
> +      }
>      }
> -   for (i = 0; i < Elements(fpme->llvm->gs_jit_context.constants); ++i) {
> +   for (i = 0; i < Elements(llvm->gs_jit_context.constants); ++i) {
>         int num_consts =
>            draw->pt.user.gs_constants_size[i] / (sizeof(float) * 4);
> -      fpme->llvm->gs_jit_context.constants[i] = draw->pt.user.gs_constants[i];
> -      fpme->llvm->gs_jit_context.num_constants[i] = num_consts;
> +      llvm->gs_jit_context.constants[i] = draw->pt.user.gs_constants[i];
> +      llvm->gs_jit_context.num_constants[i] = num_consts;
> +      if (num_consts == 0) {
> +         llvm->gs_jit_context.constants[i] = llvm->fake_const_buf;
> +      }
>      }
>
> -   fpme->llvm->jit_context.planes =
> +   llvm->jit_context.planes =
>         (float (*)[DRAW_TOTAL_CLIP_PLANES][4]) draw->pt.user.planes[0];
> -   fpme->llvm->gs_jit_context.planes =
> +   llvm->gs_jit_context.planes =
>         (float (*)[DRAW_TOTAL_CLIP_PLANES][4]) draw->pt.user.planes[0];
>
> -   fpme->llvm->jit_context.viewports = draw->viewports;
> -   fpme->llvm->gs_jit_context.viewports = draw->viewports;
> +   llvm->jit_context.viewports = draw->viewports;
> +   llvm->gs_jit_context.viewports = draw->viewports;
>   }
>
>
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
> index 17b68ff..5aa2846 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
> @@ -944,20 +944,39 @@ gather_outputs(struct lp_build_tgsi_soa_context * bld)
>    * with a little work.
>    */
>   static LLVMValueRef
> -build_gather(struct lp_build_context *bld,
> +build_gather(struct lp_build_tgsi_context *bld_base,
>                LLVMValueRef base_ptr,
>                LLVMValueRef indexes,
> -             LLVMValueRef *overflow_mask)
> +             LLVMValueRef overflow_mask)
>   {
> -   LLVMBuilderRef builder = bld->gallivm->builder;
> +   struct gallivm_state *gallivm = bld_base->base.gallivm;
> +   LLVMBuilderRef builder = gallivm->builder;
> +   struct lp_build_context *uint_bld = &bld_base->uint_bld;
> +   struct lp_build_context *bld = &bld_base->base;
>      LLVMValueRef res = bld->undef;
>      unsigned i;
> -   LLVMValueRef temp_ptr = NULL;
> +
> +   /*
> +    * overflow_mask is a vector telling us which channels
> +    * in the vector overflowed. We use the overflow behavior for
> +    * constant buffers which is defined as:
> +    * Out of bounds access to constant buffer returns 0 in all
> +    * components. Out of bounds behavior is always with respect
> +    * to the size of the buffer bound at that slot.
> +    */
>
>      if (overflow_mask) {
> -      temp_ptr = lp_build_alloca(
> -         bld->gallivm,
> -         lp_build_vec_type(bld->gallivm, bld->type), "");
> +      /*
> +       * We avoid per-element control flow here (also due to llvm going crazy,
> +       * though I suspect it's better anyway since overflow is likely rare).
> +       * Note that since we still fetch from buffers even if num_elements was
> +       * zero (in this case we'll fetch from index zero) the jit func callers
> +       * MUST provide valid fake constant buffers of size 4x32 (the values do
> +       * not matter), otherwise we'd still need (not per element though)
> +       * control flow.
> +       */
> +      LLVMValueRef zeros = lp_build_const_int_vec(gallivm, uint_bld->type, 0);

Instead of recreating zeros, you can just use uint_bld->zero , here and 
below.



> +      indexes = lp_build_select(uint_bld, overflow_mask, zeros, indexes);
>      }
>
>      /*
> @@ -968,53 +987,17 @@ build_gather(struct lp_build_context *bld,
>         LLVMValueRef index = LLVMBuildExtractElement(builder,
>                                                      indexes, ii, "");
>         LLVMValueRef scalar_ptr, scalar;
> -      LLVMValueRef overflow;
> -      struct lp_build_if_state if_ctx;
> -
> -      /*
> -       * overflow_mask is a boolean vector telling us which channels
> -       * in the vector overflowed. We use the overflow behavior for
> -       * constant buffers which is defined as:
> -       * Out of bounds access to constant buffer returns 0 in all
> -       * componenets. Out of bounds behavior is always with respect
> -       * to the size of the buffer bound at that slot.
> -       */
> -      if (overflow_mask) {
> -         overflow = LLVMBuildExtractElement(builder, *overflow_mask,
> -                                            ii, "");
> -         lp_build_if(&if_ctx, bld->gallivm, overflow);
> -         {
> -            LLVMValueRef val = LLVMBuildLoad(builder, temp_ptr, "");
> -            val = LLVMBuildInsertElement(
> -               builder, val,
> -               LLVMConstNull(LLVMFloatTypeInContext(bld->gallivm->context)),
> -               ii, "");
> -            LLVMBuildStore(builder, val, temp_ptr);
> -         }
> -         lp_build_else(&if_ctx);
> -         {
> -            LLVMValueRef val = LLVMBuildLoad(builder, temp_ptr, "");
> -
> -            scalar_ptr = LLVMBuildGEP(builder, base_ptr,
> -                                      &index, 1, "gather_ptr");
> -            scalar = LLVMBuildLoad(builder, scalar_ptr, "");
> -
> -            val = LLVMBuildInsertElement(builder, val, scalar, ii, "");
>
> -            LLVMBuildStore(builder, val, temp_ptr);
> -         }
> -         lp_build_endif(&if_ctx);
> -      } else {
> -         scalar_ptr = LLVMBuildGEP(builder, base_ptr,
> -                                   &index, 1, "gather_ptr");
> -         scalar = LLVMBuildLoad(builder, scalar_ptr, "");
> +      scalar_ptr = LLVMBuildGEP(builder, base_ptr,
> +                                &index, 1, "gather_ptr");
> +      scalar = LLVMBuildLoad(builder, scalar_ptr, "");
>
> -         res = LLVMBuildInsertElement(builder, res, scalar, ii, "");
> -      }
> +      res = LLVMBuildInsertElement(builder, res, scalar, ii, "");
>      }
>
>      if (overflow_mask) {
> -      res = LLVMBuildLoad(builder, temp_ptr, "gather_val");
> +      LLVMValueRef zeros = lp_build_const_vec(gallivm, bld->type, 0.0);

bld->zero

> +      res = lp_build_select(bld, overflow_mask, zeros, res);
>      }
>
>      return res;
> @@ -1247,17 +1230,15 @@ emit_fetch_constant(
>         num_consts = lp_build_broadcast_scalar(uint_bld, num_consts);
>         /* Construct a boolean vector telling us which channels
>          * overflow the bound constant buffer */
> -      overflow_mask = LLVMBuildICmp(builder, LLVMIntUGE,
> -                                    indirect_index,
> -                                    num_consts, "");
> +      overflow_mask = lp_build_compare(gallivm, uint_bld->type, PIPE_FUNC_GEQUAL,
> +                                       indirect_index, num_consts);
>
>         /* index_vec = indirect_index * 4 + swizzle */
>         index_vec = lp_build_shl_imm(uint_bld, indirect_index, 2);
>         index_vec = lp_build_add(uint_bld, index_vec, swizzle_vec);
>
>         /* Gather values from the constant buffer */
> -      res = build_gather(&bld_base->base, consts_ptr, index_vec,
> -                         &overflow_mask);
> +      res = build_gather(bld_base, consts_ptr, index_vec, overflow_mask);
>      }
>      else {
>         LLVMValueRef index;  /* index into the const buffer */
> @@ -1319,7 +1300,7 @@ emit_fetch_immediate(
>                                              FALSE);
>
>            /* Gather values from the immediate register array */
> -         res = build_gather(&bld_base->base, imms_array, index_vec, NULL);
> +         res = build_gather(bld_base, imms_array, index_vec, NULL);
>         } else {
>            LLVMValueRef lindex = lp_build_const_int32(gallivm,
>                                           reg->Register.Index * 4 + swizzle);
> @@ -1373,7 +1354,7 @@ emit_fetch_input(
>         inputs_array = LLVMBuildBitCast(builder, bld->inputs_array, fptr_type, "");
>
>         /* Gather values from the input register array */
> -      res = build_gather(&bld_base->base, inputs_array, index_vec, NULL);
> +      res = build_gather(bld_base, inputs_array, index_vec, NULL);
>      } else {
>         if (bld->indirect_files & (1 << TGSI_FILE_INPUT)) {
>            LLVMValueRef lindex = lp_build_const_int32(gallivm,
> @@ -1495,7 +1476,7 @@ emit_fetch_temporary(
>         temps_array = LLVMBuildBitCast(builder, bld->temps_array, fptr_type, "");
>
>         /* Gather values from the temporary register array */
> -      res = build_gather(&bld_base->base, temps_array, index_vec, NULL);
> +      res = build_gather(bld_base, temps_array, index_vec, NULL);
>      }
>      else {
>         LLVMValueRef temp_ptr;
> diff --git a/src/gallium/drivers/llvmpipe/lp_scene.h b/src/gallium/drivers/llvmpipe/lp_scene.h
> index ad23c20..3ef0f32 100644
> --- a/src/gallium/drivers/llvmpipe/lp_scene.h
> +++ b/src/gallium/drivers/llvmpipe/lp_scene.h
> @@ -178,6 +178,8 @@ struct lp_scene {
>
>      struct cmd_bin tile[TILES_X][TILES_Y];
>      struct data_block_list data;
> +
> +   float fake_const_buf[4];
>   };
>
>
> diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c
> index 3b0056c..bfe0af4 100644
> --- a/src/gallium/drivers/llvmpipe/lp_setup.c
> +++ b/src/gallium/drivers/llvmpipe/lp_setup.c
> @@ -1103,14 +1103,16 @@ try_update_scene_state( struct lp_setup_context *setup )
>                  setup->constants[i].stored_size = current_size;
>                  setup->constants[i].stored_data = stored;
>               }
> +            setup->fs.current.jit_context.constants[i] =
> +               setup->constants[i].stored_data;
>            }
>            else {
>               setup->constants[i].stored_size = 0;
>               setup->constants[i].stored_data = NULL;
> +            setup->fs.current.jit_context.constants[i] =
> +               scene->fake_const_buf;
>            }
>
> -         setup->fs.current.jit_context.constants[i] =
> -            setup->constants[i].stored_data;
>            num_constants =
>               setup->constants[i].stored_size / (sizeof(float) * 4);
>            setup->fs.current.jit_context.num_constants[i] = num_constants;
>


Jose


More information about the mesa-dev mailing list