[Mesa-dev] [PATCH 6/7] radeonsi: move building llvm.SI.load.const into ac_build_buffer_load

Fri May 26 10:43:46 UTC 2017

On May 26, 2017 12:04 PM, "Nicolai Hähnle" <nhaehnle at gmail.com> wrote:

On 25.05.2017 19:04, Marek Olšák wrote:

> From: Marek Olšák <marek.olsak at amd.com>
>
> ---
>   src/amd/common/ac_llvm_build.c           | 46
> +++++++++++++++++++++++---------
>   src/amd/common/ac_llvm_build.h           |  3 ++-
>   src/amd/common/ac_nir_to_llvm.c          |  2 +-
>   src/gallium/drivers/radeonsi/si_shader.c | 23 +++++++---------
>   4 files changed, 47 insertions(+), 27 deletions(-)
>
> diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build
> .c
> index 3df9f53..237e929 100644
> --- a/src/amd/common/ac_llvm_build.c
> +++ b/src/amd/common/ac_llvm_build.c
> @@ -626,47 +626,69 @@ ac_build_buffer_store_dword(struct ac_llvm_context
> *ctx,
>   LLVMValueRef
>   ac_build_buffer_load(struct ac_llvm_context *ctx,
>                      LLVMValueRef rsrc,
>                      int num_channels,
>                      LLVMValueRef vindex,
>                      LLVMValueRef voffset,
>                      LLVMValueRef soffset,
>                      unsigned inst_offset,
>                      unsigned glc,
>                      unsigned slc,
> -                    bool can_speculate)
> +                    bool can_speculate,
> +                    bool allow_smem)
>   {
> +       LLVMValueRef offset = LLVMConstInt(ctx->i32, inst_offset, 0);
> +       if (voffset)
> +               offset = LLVMBuildAdd(ctx->builder, offset, voffset, "");
> +       if (soffset)
> +               offset = LLVMBuildAdd(ctx->builder, offset, soffset, "");
> +
> +       /* TODO: VI and later generations can use SMEM with GLC=1.*/
> +       if (allow_smem && !glc && !slc) {
>

This still needs to depend on can_speculate.

At least, looking at patch #7, if allow_smem were enabled for shader buffer
loads, we'd still only want to really use SMEM for read_only buffers.
Unless the plan is to put the responsibility for setting allow_smem in the
caller?

Yes, the caller should decide that. For example, if GLC is 1, we can use
SMEM freely on VI. There are also SMEM stores and atomics on VI, which
allow even more flexibility.

On a different note, we may eventually want to use one atomic per wave for
atomic Inc. Instead of messing with the EXEC mask and readfirstlane to pass
the result to other threads, we could just use SMEM atomic. That's also
easier than using GDS, which can only be accessed via vector opcodes. GDS
would suffer heavily from bank conflicts if we didn't do just one op per
wave.

Also, there are shaders that store a value using a constant offset. That's
another opportunity for using SMEM.

Anyway, I think this function can't recognize all these complex cases.

Marek

Cheers,
Nicolai

+               assert(vindex == NULL);
> +
> +               LLVMValueRef result[4];
> +
> +               for (int i = 0; i < num_channels; i++) {
> +                       if (i) {
> +                               offset = LLVMBuildAdd(ctx->builder, offset,
> +
>  LLVMConstInt(ctx->i32, 4, 0), "");
> +                       }
> +                       LLVMValueRef args[2] = {rsrc, offset};
> +                       result[i] = ac_build_intrinsic(ctx,
> "llvm.SI.load.const.v4i32",
> +                                                      ctx->f32, args, 2,
> +
> AC_FUNC_ATTR_READNONE |
> +
> AC_FUNC_ATTR_LEGACY);
> +               }
> +               if (num_channels == 1)
> +                       return result[0];
> +
> +               if (num_channels == 3)
> +                       result[num_channels++] = LLVMGetUndef(ctx->f32);
> +               return ac_build_gather_values(ctx, result, num_channels);
> +       }
> +
>         unsigned func = CLAMP(num_channels, 1, 3) - 1;
>         LLVMValueRef args[] = {
>                 LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""),
>                 vindex ? vindex : LLVMConstInt(ctx->i32, 0, 0),
> -               LLVMConstInt(ctx->i32, inst_offset, 0),
> +               offset,
>                 LLVMConstInt(ctx->i1, glc, 0),
>                 LLVMConstInt(ctx->i1, slc, 0)
>         };
>         LLVMTypeRef types[] = {ctx->f32, LLVMVectorType(ctx->f32, 2),
>                                ctx->v4f32};
>         const char *type_names[] = {"f32", "v2f32", "v4f32"};
>         char name[256];
>   -     if (voffset) {
> -               args[2] = LLVMBuildAdd(ctx->builder, args[2], voffset,
> -                               "");
> -       }
> -
> -       if (soffset) {
> -               args[2] = LLVMBuildAdd(ctx->builder, args[2], soffset,
> -                               "");
> -       }
> -
>         snprintf(name, sizeof(name), "llvm.amdgcn.buffer.load.%s",
>                  type_names[func]);
>         return ac_build_intrinsic(ctx, name, types[func], args,
>                                   ARRAY_SIZE(args),
>                                   /* READNONE means writes can't affect
> it, while
>                                    * READONLY means that writes can affect
> it. */
>                                   can_speculate && HAVE_LLVM >= 0x0400 ?
>                                           AC_FUNC_ATTR_READNONE :
>                                           AC_FUNC_ATTR_READONLY);
> diff --git a/src/amd/common/ac_llvm_build.h b/src/amd/common/ac_llvm_build
> .h
> index c1b5f3d..ebb78fb 100644
> --- a/src/amd/common/ac_llvm_build.h
> +++ b/src/amd/common/ac_llvm_build.h
> @@ -136,21 +136,22 @@ ac_build_buffer_store_dword(struct ac_llvm_context
> *ctx,
>   LLVMValueRef
>   ac_build_buffer_load(struct ac_llvm_context *ctx,
>                      LLVMValueRef rsrc,
>                      int num_channels,
>                      LLVMValueRef vindex,
>                      LLVMValueRef voffset,
>                      LLVMValueRef soffset,
>                      unsigned inst_offset,
>                      unsigned glc,
>                      unsigned slc,
> -                    bool can_speculate);
> +                    bool can_speculate,
> +                    bool allow_smem);
>     LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx,
>                                          LLVMValueRef rsrc,
>                                          LLVMValueRef vindex,
>                                          LLVMValueRef voffset,
>                                          bool can_speculate);
>     LLVMValueRef
>   ac_get_thread_id(struct ac_llvm_context *ctx);
>   diff --git a/src/amd/common/ac_nir_to_llvm.c
> b/src/amd/common/ac_nir_to_llvm.c
> index 8ae0a75..28ba47d 100644
> --- a/src/amd/common/ac_nir_to_llvm.c
> +++ b/src/amd/common/ac_nir_to_llvm.c
> @@ -2816,21 +2816,21 @@ load_tes_input(struct nir_to_llvm_context *ctx,
>         param = shader_io_get_unique_index(instr->variables[0]->var->data.
> location);
>         if (instr->variables[0]->var->data.location ==
> VARYING_SLOT_CLIP_DIST0 &&
>             is_compact && const_index > 3) {
>                 const_index -= 3;
>                 param++;
>         }
>         buf_addr = get_tcs_tes_buffer_address_params(ctx, param,
> const_index,
>                                                      is_compact,
> vertex_index, indir_index);
>         result = ac_build_buffer_load(&ctx->ac, ctx->hs_ring_tess_offchip,
> instr->num_components, NULL,
> -                                     buf_addr, ctx->oc_lds, is_compact ?
> (4 * const_index) : 0, 1, 0, true);
> +                                     buf_addr, ctx->oc_lds, is_compact ?
> (4 * const_index) : 0, 1, 0, true, false);
>         result = trim_vector(ctx, result, instr->num_components);
>         result = LLVMBuildBitCast(ctx->builder, result, get_def_type(ctx,
> &instr->dest.ssa), "");
>         return result;
>   }
>     static LLVMValueRef
>   load_gs_input(struct nir_to_llvm_context *ctx,
>               nir_intrinsic_instr *instr)
>   {
>         LLVMValueRef indir_index, vtx_offset;
> diff --git a/src/gallium/drivers/radeonsi/si_shader.c
> b/src/gallium/drivers/radeonsi/si_shader.c
> index fe0cd4a..c2b3cae 100644
> --- a/src/gallium/drivers/radeonsi/si_shader.c
> +++ b/src/gallium/drivers/radeonsi/si_shader.c
> @@ -826,39 +826,39 @@ static LLVMValueRef buffer_load(struct
> lp_build_tgsi_context *bld_base,
>                                   LLVMValueRef base, bool can_speculate)
>   {
>         struct si_shader_context *ctx = si_shader_context(bld_base);
>         struct gallivm_state *gallivm = &ctx->gallivm;
>         LLVMValueRef value, value2;
>         LLVMTypeRef llvm_type = tgsi2llvmtype(bld_base, type);
>         LLVMTypeRef vec_type = LLVMVectorType(llvm_type, 4);
>         if (swizzle == ~0) {
>                 value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL,
> base, offset,
> -                                            0, 1, 0, can_speculate);
> +                                            0, 1, 0, can_speculate,
> false);
>                 return LLVMBuildBitCast(gallivm->builder, value,
> vec_type, "");
>         }
>         if (!tgsi_type_is_64bit(type)) {
>                 value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL,
> base, offset,
> -                                            0, 1, 0, can_speculate);
> +                                            0, 1, 0, can_speculate,
> false);
>                 value = LLVMBuildBitCast(gallivm->builder, value,
> vec_type, "");
>                 return LLVMBuildExtractElement(gallivm->builder, value,
>                                     LLVMConstInt(ctx->i32, swizzle, 0),
> "");
>         }
>         value = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base,
> offset,
> -                                 swizzle * 4, 1, 0, can_speculate);
> +                                 swizzle * 4, 1, 0, can_speculate, false);
>         value2 = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base,
> offset,
> -                                  swizzle * 4 + 4, 1, 0, can_speculate);
> +                                  swizzle * 4 + 4, 1, 0, can_speculate,
> false);
>         return si_llvm_emit_fetch_64bit(bld_base, type, value, value2);
>   }
>     /**
>    * Load from LDS.
>    *
>    * \param type                output value type
>    * \param swizzle     offset (typically 0..3); it can be ~0, which loads
> a vec4
>    * \param dw_addr     address in dwords
> @@ -1147,28 +1147,28 @@ static LLVMValueRef fetch_input_gs(
>                 vtx_offset_param += ctx->param_gs_vtx2_offset - 2;
>         }
>         vtx_offset = lp_build_mul_imm(uint,
>                                       LLVMGetParam(ctx->main_fn,
>                                                    vtx_offset_param),
>                                       4);
>         soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle) * 256, 0);
>         value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1,
> ctx->i32_0,
> -                                    vtx_offset, soffset, 0, 1, 0, true);
> +                                    vtx_offset, soffset, 0, 1, 0, true,
> false);
>         if (tgsi_type_is_64bit(type)) {
>                 LLVMValueRef value2;
>                 soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle + 1)
> * 256, 0);
>                 value2 = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1,
>                                               ctx->i32_0, vtx_offset,
> soffset,
> -                                             0, 1, 0, true);
> +                                             0, 1, 0, true, false);
>                 return si_llvm_emit_fetch_64bit(bld_base, type,
>                                                 value, value2);
>         }
>         return LLVMBuildBitCast(gallivm->builder,
>                                 value,
>                                 tgsi2llvmtype(bld_base, type), "");
>   }
>     static int lookup_interp_param_index(unsigned interpolate, unsigned
> location)
>   {
> @@ -1382,26 +1382,22 @@ static LLVMValueRef get_sample_id(struct
> si_shader_context *ctx)
>   }
>       /**
>    * Load a dword from a constant buffer.
>    */
>   static LLVMValueRef buffer_load_const(struct si_shader_context *ctx,
>                                       LLVMValueRef resource,
>                                       LLVMValueRef offset)
>   {
> -       LLVMBuilderRef builder = ctx->gallivm.builder;
> -       LLVMValueRef args[2] = {resource, offset};
> -
> -       return lp_build_intrinsic(builder, "llvm.SI.load.const.v4i32",
> ctx->f32, args, 2,
> -                                 LP_FUNC_ATTR_READNONE |
> -                                 LP_FUNC_ATTR_LEGACY);
> +       return ac_build_buffer_load(&ctx->ac, resource, 1, NULL, offset,
> NULL,
> +                                   0, 0, 0, true, true);
>   }
>     static LLVMValueRef load_sample_position(struct si_shader_context
> *ctx, LLVMValueRef sample_id)
>   {
>         struct lp_build_context *uint_bld = &ctx->bld_base.uint_bld;
>         struct gallivm_state *gallivm = &ctx->gallivm;
>         LLVMBuilderRef builder = gallivm->builder;
>         LLVMValueRef desc = LLVMGetParam(ctx->main_fn,
> ctx->param_rw_buffers);
>         LLVMValueRef buf_index = LLVMConstInt(ctx->i32,
> SI_PS_CONST_SAMPLE_POSITIONS, 0);
>         LLVMValueRef resource = ac_build_indexed_load_const(&ctx->ac,
> desc, buf_index);
> @@ -5201,21 +5197,22 @@ si_generate_gs_copy_shader(struct si_screen
> *sscreen,
>                                 }
>                                 LLVMValueRef soffset =
> LLVMConstInt(ctx.i32,
>                                         offset *
> gs_selector->gs_max_out_vertices * 16 * 4, 0);
>                                 offset++;
>                                 outputs[i].values[chan] =
>                                         ac_build_buffer_load(&ctx.ac,
>
>  ctx.gsvs_ring[0], 1,
>                                                              ctx.i32_0,
> voffset,
> -                                                            soffset, 0,
> 1, 1, true);
> +                                                            soffset, 0,
> 1, 1,
> +                                                            true, false);
>                         }
>                 }
>                 /* Streamout and exports. */
>                 if (gs_selector->so.num_outputs) {
>                         si_llvm_emit_streamout(&ctx, outputs,
>                                                gsinfo->num_outputs,
>                                                stream);
>                 }
>
>

-- 
Lerne, wie die Welt wirklich ist,
Aber vergiss niemals, wie sie sein sollte.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/mesa-dev/attachments/20170526/8da2dad3/attachment-0001.html>