[Mesa-dev] [PATCH] ac: Use mov_dpp for derivatives.
Connor Abbott
cwabbott0 at gmail.com
Thu Jun 15 19:02:37 UTC 2017
On Jun 15, 2017 3:31 AM, "Nicolai Hähnle" <nhaehnle at gmail.com> wrote:
On 15.06.2017 00:36, Connor Abbott wrote:
> I was looking into WQM stuff today, and I realized that LLVM will no
> longer mark this instruction as needing WQM, which seems like a
> problem. Seems like we need a patch to LLVM. Other uses of DPP (e.g.
> for the subgroup reduction stuff) won't want WQM, so I'm not sure
> what's the best approach there. If we add an attribute, will LLVM
> guarantee that it won't be removed?
>
Ah, that sucks, I didn't think of that either.
By attribute, you mean metadata? Unfortunately, metadata is not guaranteed
to be preserved.
One idea would be to add an i1 function argument to the mov.dpp intrinsic
which enables WQM. (Which might mean adding a new intrinsic, actually; so
perhaps a mov.dpp.wqm intrinsic?)
Cheers,
Nicolai
Well, from the discussion on llvm-dev it seems like we want a new intrinsic
anyways, since llvm.amdgcn.mov.dpp doesn't give you any control over what
the value in the unwritten lanes is. And for the reduction stuff, we also
want something like "whole wavefront mode" where we set EXEC to its
original value at the start of the program (or maybe just ~0), so I guess
we want an i2 argument.
Connor
> On Sat, Jun 10, 2017 at 1:05 PM, Bas Nieuwenhuizen
> <bas at basnieuwenhuizen.nl> wrote:
>
>> Slightly faster than bpermute, and seems supported since at least
>> LLVM 3.9.
>>
>> v2: Since this supersedes bpermute, remove the bpermute code.
>> Signed-off-by: Bas Nieuwenhuizen <basni at google.com>
>> ---
>> src/amd/common/ac_llvm_build.c | 47
>> ++++++++++++++++++++------------
>> src/amd/common/ac_llvm_build.h | 2 +-
>> src/amd/common/ac_nir_to_llvm.c | 8 +++---
>> src/gallium/drivers/radeonsi/si_pipe.c | 2 +-
>> src/gallium/drivers/radeonsi/si_pipe.h | 2 +-
>> src/gallium/drivers/radeonsi/si_shader.c | 4 +--
>> 6 files changed, 38 insertions(+), 27 deletions(-)
>>
>> diff --git a/src/amd/common/ac_llvm_build.c
>> b/src/amd/common/ac_llvm_build.c
>> index 237e9291d41..99d41bf52d6 100644
>> --- a/src/amd/common/ac_llvm_build.c
>> +++ b/src/amd/common/ac_llvm_build.c
>> @@ -783,41 +783,52 @@ ac_get_thread_id(struct ac_llvm_context *ctx)
>> */
>> LLVMValueRef
>> ac_build_ddxy(struct ac_llvm_context *ctx,
>> - bool has_ds_bpermute,
>> + bool has_mov_dpp,
>> uint32_t mask,
>> int idx,
>> LLVMValueRef lds,
>> LLVMValueRef val)
>> {
>> - LLVMValueRef thread_id, tl, trbl, tl_tid, trbl_tid, args[2];
>> + LLVMValueRef thread_id, tl, trbl, args[5];
>> LLVMValueRef result;
>>
>> - thread_id = ac_get_thread_id(ctx);
>> + if (has_mov_dpp) {
>> + uint32_t tl_ctrl = 0, trbl_ctrl = 0;
>>
>> - tl_tid = LLVMBuildAnd(ctx->builder, thread_id,
>> - LLVMConstInt(ctx->i32, mask, false), "");
>> -
>> - trbl_tid = LLVMBuildAdd(ctx->builder, tl_tid,
>> - LLVMConstInt(ctx->i32, idx, false), "");
>> + for (unsigned i = 0; i < 4; ++i) {
>> + tl_ctrl |= (i & mask) << (2 * i);
>> + trbl_ctrl |= ((i & mask) + idx) << (2 * i);
>> + }
>>
>> - if (has_ds_bpermute) {
>> - args[0] = LLVMBuildMul(ctx->builder, tl_tid,
>> - LLVMConstInt(ctx->i32, 4, false),
>> "");
>> - args[1] = val;
>> + args[0] = val;
>> + args[1] = LLVMConstInt(ctx->i32, tl_ctrl, false);
>> + args[2] = LLVMConstInt(ctx->i32, 0xf, false);
>> + args[3] = LLVMConstInt(ctx->i32, 0xf, false);
>> + args[4] = LLVMConstInt(ctx->i1, 1, false);
>> tl = ac_build_intrinsic(ctx,
>> - "llvm.amdgcn.ds.bpermute",
>> ctx->i32,
>> - args, 2,
>> + "llvm.amdgcn.mov.dpp.i32",
>> ctx->i32,
>> + args, 5,
>> AC_FUNC_ATTR_READNONE |
>> AC_FUNC_ATTR_CONVERGENT);
>>
>> - args[0] = LLVMBuildMul(ctx->builder, trbl_tid,
>> - LLVMConstInt(ctx->i32, 4, false),
>> "");
>> + args[1] = LLVMConstInt(ctx->i32, trbl_ctrl, false);
>> trbl = ac_build_intrinsic(ctx,
>> - "llvm.amdgcn.ds.bpermute",
>> ctx->i32,
>> - args, 2,
>> + "llvm.amdgcn.mov.dpp.i32",
>> ctx->i32,
>> + args, 5,
>> AC_FUNC_ATTR_READNONE |
>> AC_FUNC_ATTR_CONVERGENT);
>> } else {
>> + LLVMValueRef tl_tid, trbl_tid;
>> +
>> + thread_id = ac_get_thread_id(ctx);
>> +
>> + tl_tid = LLVMBuildAnd(ctx->builder, thread_id,
>> + LLVMConstInt(ctx->i32, mask, false), "");
>> +
>> + trbl_tid = LLVMBuildAdd(ctx->builder, tl_tid,
>> + LLVMConstInt(ctx->i32, idx,
>> false), "");
>> +
>> +
>> LLVMValueRef store_ptr, load_ptr0, load_ptr1;
>>
>> store_ptr = ac_build_gep0(ctx, lds, thread_id);
>> diff --git a/src/amd/common/ac_llvm_build.h
>> b/src/amd/common/ac_llvm_build.h
>> index ebb78fbd79b..14260b05018 100644
>> --- a/src/amd/common/ac_llvm_build.h
>> +++ b/src/amd/common/ac_llvm_build.h
>> @@ -161,7 +161,7 @@ ac_get_thread_id(struct ac_llvm_context *ctx);
>>
>> LLVMValueRef
>> ac_build_ddxy(struct ac_llvm_context *ctx,
>> - bool has_ds_bpermute,
>> + bool has_mov_dpp,
>> uint32_t mask,
>> int idx,
>> LLVMValueRef lds,
>> diff --git a/src/amd/common/ac_nir_to_llvm.c
>> b/src/amd/common/ac_nir_to_llvm.c
>> index 49117d21bd2..2385c60d316 100644
>> --- a/src/amd/common/ac_nir_to_llvm.c
>> +++ b/src/amd/common/ac_nir_to_llvm.c
>> @@ -164,7 +164,7 @@ struct nir_to_llvm_context {
>> uint8_t num_output_clips;
>> uint8_t num_output_culls;
>>
>> - bool has_ds_bpermute;
>> + bool has_mov_dpp;
>>
>> bool is_gs_copy_shader;
>> LLVMValueRef gs_next_vertex;
>> @@ -1434,7 +1434,7 @@ static LLVMValueRef emit_ddxy(struct
>> nir_to_llvm_context *ctx,
>> LLVMValueRef result;
>> ctx->has_ddxy = true;
>>
>> - if (!ctx->lds && !ctx->has_ds_bpermute)
>> + if (!ctx->lds && !ctx->has_mov_dpp)
>> ctx->lds = LLVMAddGlobalInAddressSpace(ctx->module,
>>
>> LLVMArrayType(ctx->i32, 64),
>> "ddxy_lds",
>> LOCAL_ADDR_SPACE);
>> @@ -1454,7 +1454,7 @@ static LLVMValueRef emit_ddxy(struct
>> nir_to_llvm_context *ctx,
>> else
>> idx = 2;
>>
>> - result = ac_build_ddxy(&ctx->ac, ctx->has_ds_bpermute,
>> + result = ac_build_ddxy(&ctx->ac, ctx->has_mov_dpp,
>> mask, idx, ctx->lds,
>> src0);
>> return result;
>> @@ -5858,7 +5858,7 @@ LLVMModuleRef ac_translate_nir_to_llvm(LLVMTargetMachineRef
>> tm,
>> ac_llvm_context_init(&ctx.ac, ctx.context);
>> ctx.ac.module = ctx.module;
>>
>> - ctx.has_ds_bpermute = ctx.options->chip_class >= VI;
>> + ctx.has_mov_dpp = ctx.options->chip_class >= VI;
>>
>> memset(shader_info, 0, sizeof(*shader_info));
>>
>> diff --git a/src/gallium/drivers/radeonsi/si_pipe.c
>> b/src/gallium/drivers/radeonsi/si_pipe.c
>> index cb372267cde..7e83d5e5ac4 100644
>> --- a/src/gallium/drivers/radeonsi/si_pipe.c
>> +++ b/src/gallium/drivers/radeonsi/si_pipe.c
>> @@ -944,7 +944,7 @@ struct pipe_screen *radeonsi_screen_create(struct
>> radeon_winsys *ws)
>> sscreen->b.info.pfp_fw_version >= 121 &&
>> sscreen->b.info.me_fw_version >= 87);
>>
>> - sscreen->has_ds_bpermute = sscreen->b.chip_class >= VI;
>> + sscreen->has_mov_dpp = sscreen->b.chip_class >= VI;
>> sscreen->has_msaa_sample_loc_bug = (sscreen->b.family >=
>> CHIP_POLARIS10 &&
>> sscreen->b.family <=
>> CHIP_POLARIS12) ||
>> sscreen->b.family ==
>> CHIP_VEGA10 ||
>> diff --git a/src/gallium/drivers/radeonsi/si_pipe.h
>> b/src/gallium/drivers/radeonsi/si_pipe.h
>> index 108929c10c6..ef139fb0cd3 100644
>> --- a/src/gallium/drivers/radeonsi/si_pipe.h
>> +++ b/src/gallium/drivers/radeonsi/si_pipe.h
>> @@ -79,7 +79,7 @@ struct si_screen {
>> unsigned tess_offchip_block_dw_size;
>> bool has_distributed_tess;
>> bool has_draw_indirect_multi;
>> - bool has_ds_bpermute;
>> + bool has_mov_dpp;
>> bool has_msaa_sample_loc_bug;
>>
>> /* Whether shaders are monolithic (1-part) or separate (3-part).
>> */
>> diff --git a/src/gallium/drivers/radeonsi/si_shader.c
>> b/src/gallium/drivers/radeonsi/si_shader.c
>> index 2c92269a575..2eed45d79a5 100644
>> --- a/src/gallium/drivers/radeonsi/si_shader.c
>> +++ b/src/gallium/drivers/radeonsi/si_shader.c
>> @@ -3442,7 +3442,7 @@ static void si_llvm_emit_ddxy(
>> idx = (opcode == TGSI_OPCODE_DDX || opcode ==
>> TGSI_OPCODE_DDX_FINE) ? 1 : 2;
>>
>> val = LLVMBuildBitCast(gallivm->builder, emit_data->args[0],
>> ctx->i32, "");
>> - val = ac_build_ddxy(&ctx->ac, ctx->screen->has_ds_bpermute,
>> + val = ac_build_ddxy(&ctx->ac, ctx->screen->has_mov_dpp,
>> mask, idx, ctx->lds, val);
>> emit_data->output[emit_data->chan] = val;
>> }
>> @@ -4454,7 +4454,7 @@ static void create_function(struct
>> si_shader_context *ctx)
>> assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
>> shader->info.num_input_vgprs -= num_prolog_vgprs;
>>
>> - if (!ctx->screen->has_ds_bpermute &&
>> + if (!ctx->screen->has_mov_dpp &&
>> bld_base->info &&
>> (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 ||
>> bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0 ||
>> --
>> 2.13.0
>>
>> _______________________________________________
>> mesa-dev mailing list
>> mesa-dev at lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
>>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
>
>
--
Lerne, wie die Welt wirklich ist,
Aber vergiss niemals, wie sie sein sollte.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/mesa-dev/attachments/20170615/8ae03b05/attachment-0001.html>
More information about the mesa-dev
mailing list