<div dir="auto"><div><br><div class="gmail_extra"><br><div class="gmail_quote">On May 26, 2017 12:04 PM, "Nicolai Hähnle" <<a href="mailto:nhaehnle@gmail.com">nhaehnle@gmail.com</a>> wrote:<br type="attribution"><blockquote class="quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><div class="elided-text">On 25.05.2017 19:04, Marek Olšák wrote:<br>
<blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
From: Marek Olšák <<a href="mailto:marek.olsak@amd.com" target="_blank">marek.olsak@amd.com</a>><br>
<br>
---<br>
  src/amd/common/ac_llvm_build.<wbr>c           | 46 +++++++++++++++++++++++-------<wbr>--<br>
  src/amd/common/ac_llvm_build.<wbr>h           |  3 ++-<br>
  src/amd/common/ac_nir_to_llvm.<wbr>c          |  2 +-<br>
  src/gallium/drivers/radeonsi/s<wbr>i_shader.c | 23 +++++++---------<br>
  4 files changed, 47 insertions(+), 27 deletions(-)<br>
<br>
diff --git a/src/amd/common/ac_llvm_build<wbr>.c b/src/amd/common/ac_llvm_build<wbr>.c<br>
index 3df9f53..237e929 100644<br>
--- a/src/amd/common/ac_llvm_build<wbr>.c<br>
+++ b/src/amd/common/ac_llvm_build<wbr>.c<br>
@@ -626,47 +626,69 @@ ac_build_buffer_store_dword(st<wbr>ruct ac_llvm_context *ctx,<br>
  LLVMValueRef<br>
  ac_build_buffer_load(struct ac_llvm_context *ctx,<br>
                     LLVMValueRef rsrc,<br>
                     int num_channels,<br>
                     LLVMValueRef vindex,<br>
                     LLVMValueRef voffset,<br>
                     LLVMValueRef soffset,<br>
                     unsigned inst_offset,<br>
                     unsigned glc,<br>
                     unsigned slc,<br>
-                    bool can_speculate)<br>
+                    bool can_speculate,<br>
+                    bool allow_smem)<br>
  {<br>
+       LLVMValueRef offset = LLVMConstInt(ctx->i32, inst_offset, 0);<br>
+       if (voffset)<br>
+               offset = LLVMBuildAdd(ctx->builder, offset, voffset, "");<br>
+       if (soffset)<br>
+               offset = LLVMBuildAdd(ctx->builder, offset, soffset, "");<br>
+<br>
+       /* TODO: VI and later generations can use SMEM with GLC=1.*/<br>
+       if (allow_smem && !glc && !slc) {<br>
</blockquote>
<br></div>
This still needs to depend on can_speculate.<br>
<br>
At least, looking at patch #7, if allow_smem were enabled for shader buffer loads, we'd still only want to really use SMEM for read_only buffers. Unless the plan is to put the responsibility for setting allow_smem in the caller?<br></blockquote></div></div></div><div dir="auto"><br></div><div dir="auto">Yes, the caller should decide that. For example, if GLC is 1, we can use SMEM freely on VI. There are also SMEM stores and atomics on VI, which allow even more flexibility.</div><div dir="auto"><br></div><div dir="auto">On a different note, we may eventually want to use one atomic per wave for atomic Inc. Instead of messing with the EXEC mask and readfirstlane to pass the result to other threads, we could just use SMEM atomic. That's also easier than using GDS, which can only be accessed via vector opcodes. GDS would suffer heavily from bank conflicts if we didn't do just one op per wave.</div><div dir="auto"><br></div><div dir="auto">Also, there are shaders that store a value using a constant offset. That's another opportunity for using SMEM.</div><div dir="auto"><br></div><div dir="auto">Anyway, I think this function can't recognize all these complex cases.</div><div dir="auto"><br></div><div dir="auto">Marek</div><div dir="auto"><br></div><div dir="auto"><div class="gmail_extra"><div class="gmail_quote"><blockquote class="quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
<br>
Cheers,<br>
Nicolai<div class="elided-text"><br>
<br>
<br>
<blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
+               assert(vindex == NULL);<br>
+<br>
+               LLVMValueRef result[4];<br>
+<br>
+               for (int i = 0; i < num_channels; i++) {<br>
+                       if (i) {<br>
+                               offset = LLVMBuildAdd(ctx->builder, offset,<br>
+                                                     LLVMConstInt(ctx->i32, 4, 0), "");<br>
+                       }<br>
+                       LLVMValueRef args[2] = {rsrc, offset};<br>
+                       result[i] = ac_build_intrinsic(ctx, "llvm.SI.load.const.v4i32",<br>
+                                                      ctx->f32, args, 2,<br>
+                                                      AC_FUNC_ATTR_READNONE |<br>
+                                                      AC_FUNC_ATTR_LEGACY);<br>
+               }<br>
+               if (num_channels == 1)<br>
+                       return result[0];<br>
+<br>
+               if (num_channels == 3)<br>
+                       result[num_channels++] = LLVMGetUndef(ctx->f32);<br>
+               return ac_build_gather_values(ctx, result, num_channels);<br>
+       }<br>
+<br>
        unsigned func = CLAMP(num_channels, 1, 3) - 1;<br>
        LLVMValueRef args[] = {<br>
                LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""),<br>
                vindex ? vindex : LLVMConstInt(ctx->i32, 0, 0),<br>
-               LLVMConstInt(ctx->i32, inst_offset, 0),<br>
+               offset,<br>
                LLVMConstInt(ctx->i1, glc, 0),<br>
                LLVMConstInt(ctx->i1, slc, 0)<br>
        };<br>
        LLVMTypeRef types[] = {ctx->f32, LLVMVectorType(ctx->f32, 2),<br>
                               ctx->v4f32};<br>
        const char *type_names[] = {"f32", "v2f32", "v4f32"};<br>
        char name[256];<br>
  -     if (voffset) {<br>
-               args[2] = LLVMBuildAdd(ctx->builder, args[2], voffset,<br>
-                               "");<br>
-       }<br>
-<br>
-       if (soffset) {<br>
-               args[2] = LLVMBuildAdd(ctx->builder, args[2], soffset,<br>
-                               "");<br>
-       }<br>
-<br>
        snprintf(name, sizeof(name), "llvm.amdgcn.buffer.load.%s",<br>
                 type_names[func]);<br>
        return ac_build_intrinsic(ctx, name, types[func], args,<br>
                                  ARRAY_SIZE(args),<br>
                                  /* READNONE means writes can't affect it, while<br>
                                   * READONLY means that writes can affect it. */<br>
                                  can_speculate && HAVE_LLVM >= 0x0400 ?<br>
                                          AC_FUNC_ATTR_READNONE :<br>
                                          AC_FUNC_ATTR_READONLY);<br>
diff --git a/src/amd/common/ac_llvm_build<wbr>.h b/src/amd/common/ac_llvm_build<wbr>.h<br>
index c1b5f3d..ebb78fb 100644<br>
--- a/src/amd/common/ac_llvm_build<wbr>.h<br>
+++ b/src/amd/common/ac_llvm_build<wbr>.h<br>
@@ -136,21 +136,22 @@ ac_build_buffer_store_dword(st<wbr>ruct ac_llvm_context *ctx,<br>
  LLVMValueRef<br>
  ac_build_buffer_load(struct ac_llvm_context *ctx,<br>
                     LLVMValueRef rsrc,<br>
                     int num_channels,<br>
                     LLVMValueRef vindex,<br>
                     LLVMValueRef voffset,<br>
                     LLVMValueRef soffset,<br>
                     unsigned inst_offset,<br>
                     unsigned glc,<br>
                     unsigned slc,<br>
-                    bool can_speculate);<br>
+                    bool can_speculate,<br>
+                    bool allow_smem);<br>
    LLVMValueRef ac_build_buffer_load_format(st<wbr>ruct ac_llvm_context *ctx,<br>
                                         LLVMValueRef rsrc,<br>
                                         LLVMValueRef vindex,<br>
                                         LLVMValueRef voffset,<br>
                                         bool can_speculate);<br>
    LLVMValueRef<br>
  ac_get_thread_id(struct ac_llvm_context *ctx);<br>
  diff --git a/src/amd/common/ac_nir_to_llv<wbr>m.c b/src/amd/common/ac_nir_to_llv<wbr>m.c<br>
index 8ae0a75..28ba47d 100644<br>
--- a/src/amd/common/ac_nir_to_llv<wbr>m.c<br>
+++ b/src/amd/common/ac_nir_to_llv<wbr>m.c<br>
@@ -2816,21 +2816,21 @@ load_tes_input(struct nir_to_llvm_context *ctx,<br>
        param = shader_io_get_unique_index(ins<wbr>tr->variables[0]->var->data.<wbr>location);<br>
        if (instr->variables[0]->var->dat<wbr>a.location == VARYING_SLOT_CLIP_DIST0 &&<br>
            is_compact && const_index > 3) {<br>
                const_index -= 3;<br>
                param++;<br>
        }<br>
        buf_addr = get_tcs_tes_buffer_address_par<wbr>ams(ctx, param, const_index,<br>
                                                     is_compact, vertex_index, indir_index);<br>
        result = ac_build_buffer_load(&ctx->ac, ctx->hs_ring_tess_offchip, instr->num_components, NULL,<br>
-                                     buf_addr, ctx->oc_lds, is_compact ? (4 * const_index) : 0, 1, 0, true);<br>
+                                     buf_addr, ctx->oc_lds, is_compact ? (4 * const_index) : 0, 1, 0, true, false);<br>
        result = trim_vector(ctx, result, instr->num_components);<br>
        result = LLVMBuildBitCast(ctx->builder, result, get_def_type(ctx, &instr->dest.ssa), "");<br>
        return result;<br>
  }<br>
    static LLVMValueRef<br>
  load_gs_input(struct nir_to_llvm_context *ctx,<br>
              nir_intrinsic_instr *instr)<br>
  {<br>
        LLVMValueRef indir_index, vtx_offset;<br>
diff --git a/src/gallium/drivers/radeonsi<wbr>/si_shader.c b/src/gallium/drivers/radeonsi<wbr>/si_shader.c<br>
index fe0cd4a..c2b3cae 100644<br>
--- a/src/gallium/drivers/radeonsi<wbr>/si_shader.c<br>
+++ b/src/gallium/drivers/radeonsi<wbr>/si_shader.c<br>
@@ -826,39 +826,39 @@ static LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base,<br>
                                  LLVMValueRef base, bool can_speculate)<br>
  {<br>
        struct si_shader_context *ctx = si_shader_context(bld_base);<br>
        struct gallivm_state *gallivm = &ctx->gallivm;<br>
        LLVMValueRef value, value2;<br>
        LLVMTypeRef llvm_type = tgsi2llvmtype(bld_base, type);<br>
        LLVMTypeRef vec_type = LLVMVectorType(llvm_type, 4);<br>
        if (swizzle == ~0) {<br>
                value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,<br>
-                                            0, 1, 0, can_speculate);<br>
+                                            0, 1, 0, can_speculate, false);<br>
                return LLVMBuildBitCast(gallivm->buil<wbr>der, value, vec_type, "");<br>
        }<br>
        if (!tgsi_type_is_64bit(type)) {<br>
                value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,<br>
-                                            0, 1, 0, can_speculate);<br>
+                                            0, 1, 0, can_speculate, false);<br>
                value = LLVMBuildBitCast(gallivm->buil<wbr>der, value, vec_type, "");<br>
                return LLVMBuildExtractElement(galliv<wbr>m->builder, value,<br>
                                    LLVMConstInt(ctx->i32, swizzle, 0), "");<br>
        }<br>
        value = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,<br>
-                                 swizzle * 4, 1, 0, can_speculate);<br>
+                                 swizzle * 4, 1, 0, can_speculate, false);<br>
        value2 = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,<br>
-                                  swizzle * 4 + 4, 1, 0, can_speculate);<br>
+                                  swizzle * 4 + 4, 1, 0, can_speculate, false);<br>
        return si_llvm_emit_fetch_64bit(bld_b<wbr>ase, type, value, value2);<br>
  }<br>
    /**<br>
   * Load from LDS.<br>
   *<br>
   * \param type                output value type<br>
   * \param swizzle     offset (typically 0..3); it can be ~0, which loads a vec4<br>
   * \param dw_addr     address in dwords<br>
@@ -1147,28 +1147,28 @@ static LLVMValueRef fetch_input_gs(<br>
                vtx_offset_param += ctx->param_gs_vtx2_offset - 2;<br>
        }<br>
        vtx_offset = lp_build_mul_imm(uint,<br>
                                      LLVMGetParam(ctx->main_fn,<br>
                                                   vtx_offset_param),<br>
                                      4);<br>
        soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle) * 256, 0);<br>
        value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->i32_0,<br>
-                                    vtx_offset, soffset, 0, 1, 0, true);<br>
+                                    vtx_offset, soffset, 0, 1, 0, true, false);<br>
        if (tgsi_type_is_64bit(type)) {<br>
                LLVMValueRef value2;<br>
                soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle + 1) * 256, 0);<br>
                value2 = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1,<br>
                                              ctx->i32_0, vtx_offset, soffset,<br>
-                                             0, 1, 0, true);<br>
+                                             0, 1, 0, true, false);<br>
                return si_llvm_emit_fetch_64bit(bld_b<wbr>ase, type,<br>
                                                value, value2);<br>
        }<br>
        return LLVMBuildBitCast(gallivm->buil<wbr>der,<br>
                                value,<br>
                                tgsi2llvmtype(bld_base, type), "");<br>
  }<br>
    static int lookup_interp_param_index(unsi<wbr>gned interpolate, unsigned location)<br>
  {<br>
@@ -1382,26 +1382,22 @@ static LLVMValueRef get_sample_id(struct si_shader_context *ctx)<br>
  }<br>
      /**<br>
   * Load a dword from a constant buffer.<br>
   */<br>
  static LLVMValueRef buffer_load_const(struct si_shader_context *ctx,<br>
                                      LLVMValueRef resource,<br>
                                      LLVMValueRef offset)<br>
  {<br>
-       LLVMBuilderRef builder = ctx->gallivm.builder;<br>
-       LLVMValueRef args[2] = {resource, offset};<br>
-<br>
-       return lp_build_intrinsic(builder, "llvm.SI.load.const.v4i32", ctx->f32, args, 2,<br>
-                                 LP_FUNC_ATTR_READNONE |<br>
-                                 LP_FUNC_ATTR_LEGACY);<br>
+       return ac_build_buffer_load(&ctx->ac, resource, 1, NULL, offset, NULL,<br>
+                                   0, 0, 0, true, true);<br>
  }<br>
    static LLVMValueRef load_sample_position(struct si_shader_context *ctx, LLVMValueRef sample_id)<br>
  {<br>
        struct lp_build_context *uint_bld = &ctx->bld_base.uint_bld;<br>
        struct gallivm_state *gallivm = &ctx->gallivm;<br>
        LLVMBuilderRef builder = gallivm->builder;<br>
        LLVMValueRef desc = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);<br>
        LLVMValueRef buf_index = LLVMConstInt(ctx->i32, SI_PS_CONST_SAMPLE_POSITIONS, 0);<br>
        LLVMValueRef resource = ac_build_indexed_load_const(&c<wbr>tx->ac, desc, buf_index);<br>
@@ -5201,21 +5197,22 @@ si_generate_gs_copy_shader(str<wbr>uct si_screen *sscreen,<br>
                                }<br>
                                LLVMValueRef soffset = LLVMConstInt(ctx.i32,<br>
                                        offset * gs_selector->gs_max_out_vertic<wbr>es * 16 * 4, 0);<br>
                                offset++;<br>
                                outputs[i].values[chan] =<br>
                                        ac_build_buffer_load(&ctx.ac,<br>
                                                             ctx.gsvs_ring[0], 1,<br>
                                                             ctx.i32_0, voffset,<br>
-                                                            soffset, 0, 1, 1, true);<br>
+                                                            soffset, 0, 1, 1,<br>
+                                                            true, false);<br>
                        }<br>
                }<br>
                /* Streamout and exports. */<br>
                if (gs_selector->so.num_outputs) {<br>
                        si_llvm_emit_streamout(&ctx, outputs,<br>
                                               gsinfo->num_outputs,<br>
                                               stream);<br>
                }<br>
  <br>
</blockquote>
<br>
<br>
-- <br></div>
Lerne, wie die Welt wirklich ist,<br>
Aber vergiss niemals, wie sie sein sollte.<br>
</blockquote></div><br></div></div></div>