<div dir="ltr"><div dir="auto"><div><br><div class="gmail_extra"><br><div class="gmail_quote">On Jun 15, 2017 3:31 AM, "Nicolai Hähnle" <<a href="mailto:nhaehnle@gmail.com" target="_blank">nhaehnle@gmail.com</a>> wrote:<br type="attribution"><blockquote class="m_-2912746277332618109quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><div class="m_-2912746277332618109quoted-text">On 15.06.2017 00:36, Connor Abbott wrote:<br>
<blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
I was looking into WQM stuff today, and I realized that LLVM will no<br>
longer mark this instruction as needing WQM, which seems like a<br>
problem. Seems like we need a patch to LLVM. Other uses of DPP (e.g.<br>
for the subgroup reduction stuff) won't want WQM, so I'm not sure<br>
what's the best approach there. If we add an attribute, will LLVM<br>
guarantee that we won't remove it?<br>
</blockquote>
<br></div>
Ah, that sucks, I didn't think of that either.<br>
<br>
By attribute, you mean metadata? Unfortunately, metadata is not guaranteed to be preserved.<br>
<br>
One idea would be to add an i1 function argument to the mov.dpp intrinsic which enables WQM. (Which might mean adding a new intrinsic, actually; so perhaps a mov.dpp.wqm intrinsic?)<br>
<br>
Cheers,<br>
Nicolai</blockquote></div></div></div><div dir="auto"><br></div><div dir="auto">Well, from the discussion on llvm-dev it seems like we want a new intrinsic anyways, since llvm.amdgcn.mov.dpp doesn't give you any control over what the value in the unwritten lanes is. And for the reduction stuff, we also want something like "whole wavefront mode" where we set EXEC to its original value at the start of the program (or maybe just ~0), so I guess we want an i2 argument.<br><br></div><div>Connor<br></div><div dir="auto"><br></div><div dir="auto"><div class="gmail_extra"><div class="gmail_quote"><blockquote class="m_-2912746277332618109quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><div class="m_-2912746277332618109elided-text"><br>
<br>
<br>
<br>
<blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
<br>
On Sat, Jun 10, 2017 at 1:05 PM, Bas Nieuwenhuizen<br>
<<a href="mailto:bas@basnieuwenhuizen.nl" target="_blank">bas@basnieuwenhuizen.nl</a>> wrote:<br>
<blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
Slightly faster than bpermute, and seems supported since at least<br>
LLVM 3.9.<br>
<br>
v2: Since this supersedes bpermute, remove the bpermute code.<br>
Signed-off-by: Bas Nieuwenhuizen <<a href="mailto:basni@google.com" target="_blank">basni@google.com</a>><br>
---<br>
  src/amd/common/ac_llvm_build.c<wbr>           | 47 ++++++++++++++++++++----------<wbr>--<br>
  src/amd/common/ac_llvm_build.h<wbr>           |  2 +-<br>
  src/amd/common/ac_nir_to_llvm.<wbr>c          |  8 +++---<br>
  src/gallium/drivers/radeonsi/s<wbr>i_pipe.c   |  2 +-<br>
  src/gallium/drivers/radeonsi/s<wbr>i_pipe.h   |  2 +-<br>
  src/gallium/drivers/radeonsi/s<wbr>i_shader.c |  4 +--<br>
  6 files changed, 38 insertions(+), 27 deletions(-)<br>
<br>
diff --git a/src/amd/common/ac_llvm_build<wbr>.c b/src/amd/common/ac_llvm_build<wbr>.c<br>
index 237e9291d41..99d41bf52d6 100644<br>
--- a/src/amd/common/ac_llvm_build<wbr>.c<br>
+++ b/src/amd/common/ac_llvm_build<wbr>.c<br>
@@ -783,41 +783,52 @@ ac_get_thread_id(struct ac_llvm_context *ctx)<br>
   */<br>
  LLVMValueRef<br>
  ac_build_ddxy(struct ac_llvm_context *ctx,<br>
-             bool has_ds_bpermute,<br>
+             bool has_mov_dpp,<br>
               uint32_t mask,<br>
               int idx,<br>
               LLVMValueRef lds,<br>
               LLVMValueRef val)<br>
  {<br>
-       LLVMValueRef thread_id, tl, trbl, tl_tid, trbl_tid, args[2];<br>
+       LLVMValueRef thread_id, tl, trbl, args[5];<br>
         LLVMValueRef result;<br>
<br>
-       thread_id = ac_get_thread_id(ctx);<br>
+       if (has_mov_dpp) {<br>
+               uint32_t tl_ctrl = 0, trbl_ctrl = 0;<br>
<br>
-       tl_tid = LLVMBuildAnd(ctx->builder, thread_id,<br>
-                             LLVMConstInt(ctx->i32, mask, false), "");<br>
-<br>
-       trbl_tid = LLVMBuildAdd(ctx->builder, tl_tid,<br>
-                               LLVMConstInt(ctx->i32, idx, false), "");<br>
+               for (unsigned i = 0; i < 4; ++i) {<br>
+                       tl_ctrl |= (i & mask) << (2 * i);<br>
+                       trbl_ctrl |= ((i & mask) + idx) << (2 * i);<br>
+               }<br>
<br>
-       if (has_ds_bpermute) {<br>
-               args[0] = LLVMBuildMul(ctx->builder, tl_tid,<br>
-                                      LLVMConstInt(ctx->i32, 4, false), "");<br>
-               args[1] = val;<br>
+               args[0] = val;<br>
+               args[1] = LLVMConstInt(ctx->i32, tl_ctrl, false);<br>
+               args[2] = LLVMConstInt(ctx->i32, 0xf, false);<br>
+               args[3] = LLVMConstInt(ctx->i32, 0xf, false);<br>
+               args[4] = LLVMConstInt(ctx->i1, 1, false);<br>
                 tl = ac_build_intrinsic(ctx,<br>
-                                       "llvm.amdgcn.ds.bpermute", ctx->i32,<br>
-                                       args, 2,<br>
+                                       "llvm.amdgcn.mov.dpp.i32", ctx->i32,<br>
+                                       args, 5,<br>
                                         AC_FUNC_ATTR_READNONE |<br>
                                         AC_FUNC_ATTR_CONVERGENT);<br>
<br>
-               args[0] = LLVMBuildMul(ctx->builder, trbl_tid,<br>
-                                      LLVMConstInt(ctx->i32, 4, false), "");<br>
+               args[1] = LLVMConstInt(ctx->i32, trbl_ctrl, false);<br>
                 trbl = ac_build_intrinsic(ctx,<br>
-                                         "llvm.amdgcn.ds.bpermute", ctx->i32,<br>
-                                         args, 2,<br>
+                                         "llvm.amdgcn.mov.dpp.i32", ctx->i32,<br>
+                                         args, 5,<br>
                                           AC_FUNC_ATTR_READNONE |<br>
                                           AC_FUNC_ATTR_CONVERGENT);<br>
         } else {<br>
+               LLVMValueRef tl_tid, trbl_tid;<br>
+<br>
+               thread_id = ac_get_thread_id(ctx);<br>
+<br>
+               tl_tid = LLVMBuildAnd(ctx->builder, thread_id,<br>
+                               LLVMConstInt(ctx->i32, mask, false), "");<br>
+<br>
+               trbl_tid = LLVMBuildAdd(ctx->builder, tl_tid,<br>
+                                       LLVMConstInt(ctx->i32, idx, false), "");<br>
+<br>
+<br>
                 LLVMValueRef store_ptr, load_ptr0, load_ptr1;<br>
<br>
                 store_ptr = ac_build_gep0(ctx, lds, thread_id);<br>
diff --git a/src/amd/common/ac_llvm_build<wbr>.h b/src/amd/common/ac_llvm_build<wbr>.h<br>
index ebb78fbd79b..14260b05018 100644<br>
--- a/src/amd/common/ac_llvm_build<wbr>.h<br>
+++ b/src/amd/common/ac_llvm_build<wbr>.h<br>
@@ -161,7 +161,7 @@ ac_get_thread_id(struct ac_llvm_context *ctx);<br>
<br>
  LLVMValueRef<br>
  ac_build_ddxy(struct ac_llvm_context *ctx,<br>
-             bool has_ds_bpermute,<br>
+             bool has_mov_dpp,<br>
               uint32_t mask,<br>
               int idx,<br>
               LLVMValueRef lds,<br>
diff --git a/src/amd/common/ac_nir_to_llv<wbr>m.c b/src/amd/common/ac_nir_to_llv<wbr>m.c<br>
index 49117d21bd2..2385c60d316 100644<br>
--- a/src/amd/common/ac_nir_to_llv<wbr>m.c<br>
+++ b/src/amd/common/ac_nir_to_llv<wbr>m.c<br>
@@ -164,7 +164,7 @@ struct nir_to_llvm_context {<br>
         uint8_t num_output_clips;<br>
         uint8_t num_output_culls;<br>
<br>
-       bool has_ds_bpermute;<br>
+       bool has_mov_dpp;<br>
<br>
         bool is_gs_copy_shader;<br>
         LLVMValueRef gs_next_vertex;<br>
@@ -1434,7 +1434,7 @@ static LLVMValueRef emit_ddxy(struct nir_to_llvm_context *ctx,<br>
         LLVMValueRef result;<br>
         ctx->has_ddxy = true;<br>
<br>
-       if (!ctx->lds && !ctx->has_ds_bpermute)<br>
+       if (!ctx->lds && !ctx->has_mov_dpp)<br>
                 ctx->lds = LLVMAddGlobalInAddressSpace(ct<wbr>x->module,<br>
                                                        LLVMArrayType(ctx->i32, 64),<br>
                                                        "ddxy_lds", LOCAL_ADDR_SPACE);<br>
@@ -1454,7 +1454,7 @@ static LLVMValueRef emit_ddxy(struct nir_to_llvm_context *ctx,<br>
         else<br>
                 idx = 2;<br>
<br>
-       result = ac_build_ddxy(&ctx->ac, ctx->has_ds_bpermute,<br>
+       result = ac_build_ddxy(&ctx->ac, ctx->has_mov_dpp,<br>
                               mask, idx, ctx->lds,<br>
                               src0);<br>
         return result;<br>
@@ -5858,7 +5858,7 @@ LLVMModuleRef ac_translate_nir_to_llvm(LLVMT<wbr>argetMachineRef tm,<br>
         ac_llvm_context_init(&<a href="http://ctx.ac" rel="noreferrer" target="_blank">ctx.ac</a>, ctx.context);<br>
         ctx.ac.module = ctx.module;<br>
<br>
-       ctx.has_ds_bpermute = ctx.options->chip_class >= VI;<br>
+       ctx.has_mov_dpp = ctx.options->chip_class >= VI;<br>
<br>
         memset(shader_info, 0, sizeof(*shader_info));<br>
<br>
diff --git a/src/gallium/drivers/radeonsi<wbr>/si_pipe.c b/src/gallium/drivers/radeonsi<wbr>/si_pipe.c<br>
index cb372267cde..7e83d5e5ac4 100644<br>
--- a/src/gallium/drivers/radeonsi<wbr>/si_pipe.c<br>
+++ b/src/gallium/drivers/radeonsi<wbr>/si_pipe.c<br>
@@ -944,7 +944,7 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws)<br>
                  sscreen->b.info.pfp_fw_version >= 121 &&<br>
                  sscreen->b.info.me_fw_version >= 87);<br>
<br>
-       sscreen->has_ds_bpermute = sscreen->b.chip_class >= VI;<br>
+       sscreen->has_mov_dpp = sscreen->b.chip_class >= VI;<br>
         sscreen->has_msaa_sample_loc_<wbr>bug = (sscreen->b.family >= CHIP_POLARIS10 &&<br>
                                             sscreen->b.family <= CHIP_POLARIS12) ||<br>
                                            sscreen->b.family == CHIP_VEGA10 ||<br>
diff --git a/src/gallium/drivers/radeonsi<wbr>/si_pipe.h b/src/gallium/drivers/radeonsi<wbr>/si_pipe.h<br>
index 108929c10c6..ef139fb0cd3 100644<br>
--- a/src/gallium/drivers/radeonsi<wbr>/si_pipe.h<br>
+++ b/src/gallium/drivers/radeonsi<wbr>/si_pipe.h<br>
@@ -79,7 +79,7 @@ struct si_screen {<br>
         unsigned                        tess_offchip_block_dw_size;<br>
         bool                            has_distributed_tess;<br>
         bool                            has_draw_indirect_multi;<br>
-       bool                            has_ds_bpermute;<br>
+       bool                            has_mov_dpp;<br>
         bool                            has_msaa_sample_loc_bug;<br>
<br>
         /* Whether shaders are monolithic (1-part) or separate (3-part). */<br>
diff --git a/src/gallium/drivers/radeonsi<wbr>/si_shader.c b/src/gallium/drivers/radeonsi<wbr>/si_shader.c<br>
index 2c92269a575..2eed45d79a5 100644<br>
--- a/src/gallium/drivers/radeonsi<wbr>/si_shader.c<br>
+++ b/src/gallium/drivers/radeonsi<wbr>/si_shader.c<br>
@@ -3442,7 +3442,7 @@ static void si_llvm_emit_ddxy(<br>
         idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2;<br>
<br>
         val = LLVMBuildBitCast(gallivm->buil<wbr>der, emit_data->args[0], ctx->i32, "");<br>
-       val = ac_build_ddxy(&ctx->ac, ctx->screen->has_ds_bpermute,<br>
+       val = ac_build_ddxy(&ctx->ac, ctx->screen->has_mov_dpp,<br>
                             mask, idx, ctx->lds, val);<br>
         emit_data->output[emit_data-><wbr>chan] = val;<br>
  }<br>
@@ -4454,7 +4454,7 @@ static void create_function(struct si_shader_context *ctx)<br>
         assert(shader->info.num_input<wbr>_vgprs >= num_prolog_vgprs);<br>
         shader->info.num_input_vgprs -= num_prolog_vgprs;<br>
<br>
-       if (!ctx->screen->has_ds_bpermute &&<br>
+       if (!ctx->screen->has_mov_dpp &&<br>
             bld_base->info &&<br>
             (bld_base->info->opcode_count<wbr>[TGSI_OPCODE_DDX] > 0 ||<br>
              bld_base->info->opcode_count[T<wbr>GSI_OPCODE_DDY] > 0 ||<br>
--<br>
2.13.0<br>
<br>
______________________________<wbr>_________________<br>
mesa-dev mailing list<br>
<a href="mailto:mesa-dev@lists.freedesktop.org" target="_blank">mesa-dev@lists.freedesktop.org</a><br>
<a href="https://lists.freedesktop.org/mailman/listinfo/mesa-dev" rel="noreferrer" target="_blank">https://lists.freedesktop.org/<wbr>mailman/listinfo/mesa-dev</a><br>
</blockquote>
______________________________<wbr>_________________<br>
mesa-dev mailing list<br>
<a href="mailto:mesa-dev@lists.freedesktop.org" target="_blank">mesa-dev@lists.freedesktop.org</a><br>
<a href="https://lists.freedesktop.org/mailman/listinfo/mesa-dev" rel="noreferrer" target="_blank">https://lists.freedesktop.org/<wbr>mailman/listinfo/mesa-dev</a><br>
<br>
</blockquote>
<br>
<br></div><div class="m_-2912746277332618109elided-text">
-- <br>
Lerne, wie die Welt wirklich ist,<br>
Aber vergiss niemals, wie sie sein sollte.<br>
</div></blockquote></div><br></div></div></div>
</div>