[Mesa-dev] [PATCH] ac: Use mov_dpp for derivatives.
Connor Abbott
cwabbott0 at gmail.com
Wed Jun 14 22:36:52 UTC 2017
I was looking into WQM (whole quad mode) stuff today, and I realized that
with this change LLVM will no longer mark this instruction as needing WQM,
which seems like a problem. It seems like we need a patch to LLVM. Other
uses of DPP (e.g. for the subgroup reduction stuff) won't want WQM, so I'm
not sure what the best approach is there. If we add an attribute, will LLVM
guarantee that it won't be removed?
On Sat, Jun 10, 2017 at 1:05 PM, Bas Nieuwenhuizen
<bas at basnieuwenhuizen.nl> wrote:
> Slightly faster than bpermute, and seems to be supported since at least
> LLVM 3.9.
>
> v2: Since this supersedes bpermute, remove the bpermute code.
> Signed-off-by: Bas Nieuwenhuizen <basni at google.com>
> ---
> src/amd/common/ac_llvm_build.c | 47 ++++++++++++++++++++------------
> src/amd/common/ac_llvm_build.h | 2 +-
> src/amd/common/ac_nir_to_llvm.c | 8 +++---
> src/gallium/drivers/radeonsi/si_pipe.c | 2 +-
> src/gallium/drivers/radeonsi/si_pipe.h | 2 +-
> src/gallium/drivers/radeonsi/si_shader.c | 4 +--
> 6 files changed, 38 insertions(+), 27 deletions(-)
>
> diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
> index 237e9291d41..99d41bf52d6 100644
> --- a/src/amd/common/ac_llvm_build.c
> +++ b/src/amd/common/ac_llvm_build.c
> @@ -783,41 +783,52 @@ ac_get_thread_id(struct ac_llvm_context *ctx)
> */
> LLVMValueRef
> ac_build_ddxy(struct ac_llvm_context *ctx,
> - bool has_ds_bpermute,
> + bool has_mov_dpp,
> uint32_t mask,
> int idx,
> LLVMValueRef lds,
> LLVMValueRef val)
> {
> - LLVMValueRef thread_id, tl, trbl, tl_tid, trbl_tid, args[2];
> + LLVMValueRef thread_id, tl, trbl, args[5];
> LLVMValueRef result;
>
> - thread_id = ac_get_thread_id(ctx);
> + if (has_mov_dpp) {
> + uint32_t tl_ctrl = 0, trbl_ctrl = 0;
>
> - tl_tid = LLVMBuildAnd(ctx->builder, thread_id,
> - LLVMConstInt(ctx->i32, mask, false), "");
> -
> - trbl_tid = LLVMBuildAdd(ctx->builder, tl_tid,
> - LLVMConstInt(ctx->i32, idx, false), "");
> + for (unsigned i = 0; i < 4; ++i) {
> + tl_ctrl |= (i & mask) << (2 * i);
> + trbl_ctrl |= ((i & mask) + idx) << (2 * i);
> + }
>
> - if (has_ds_bpermute) {
> - args[0] = LLVMBuildMul(ctx->builder, tl_tid,
> - LLVMConstInt(ctx->i32, 4, false), "");
> - args[1] = val;
> + args[0] = val;
> + args[1] = LLVMConstInt(ctx->i32, tl_ctrl, false);
> + args[2] = LLVMConstInt(ctx->i32, 0xf, false);
> + args[3] = LLVMConstInt(ctx->i32, 0xf, false);
> + args[4] = LLVMConstInt(ctx->i1, 1, false);
> tl = ac_build_intrinsic(ctx,
> - "llvm.amdgcn.ds.bpermute", ctx->i32,
> - args, 2,
> + "llvm.amdgcn.mov.dpp.i32", ctx->i32,
> + args, 5,
> AC_FUNC_ATTR_READNONE |
> AC_FUNC_ATTR_CONVERGENT);
>
> - args[0] = LLVMBuildMul(ctx->builder, trbl_tid,
> - LLVMConstInt(ctx->i32, 4, false), "");
> + args[1] = LLVMConstInt(ctx->i32, trbl_ctrl, false);
> trbl = ac_build_intrinsic(ctx,
> - "llvm.amdgcn.ds.bpermute", ctx->i32,
> - args, 2,
> + "llvm.amdgcn.mov.dpp.i32", ctx->i32,
> + args, 5,
> AC_FUNC_ATTR_READNONE |
> AC_FUNC_ATTR_CONVERGENT);
> } else {
> + LLVMValueRef tl_tid, trbl_tid;
> +
> + thread_id = ac_get_thread_id(ctx);
> +
> + tl_tid = LLVMBuildAnd(ctx->builder, thread_id,
> + LLVMConstInt(ctx->i32, mask, false), "");
> +
> + trbl_tid = LLVMBuildAdd(ctx->builder, tl_tid,
> + LLVMConstInt(ctx->i32, idx, false), "");
> +
> +
> LLVMValueRef store_ptr, load_ptr0, load_ptr1;
>
> store_ptr = ac_build_gep0(ctx, lds, thread_id);
> diff --git a/src/amd/common/ac_llvm_build.h b/src/amd/common/ac_llvm_build.h
> index ebb78fbd79b..14260b05018 100644
> --- a/src/amd/common/ac_llvm_build.h
> +++ b/src/amd/common/ac_llvm_build.h
> @@ -161,7 +161,7 @@ ac_get_thread_id(struct ac_llvm_context *ctx);
>
> LLVMValueRef
> ac_build_ddxy(struct ac_llvm_context *ctx,
> - bool has_ds_bpermute,
> + bool has_mov_dpp,
> uint32_t mask,
> int idx,
> LLVMValueRef lds,
> diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
> index 49117d21bd2..2385c60d316 100644
> --- a/src/amd/common/ac_nir_to_llvm.c
> +++ b/src/amd/common/ac_nir_to_llvm.c
> @@ -164,7 +164,7 @@ struct nir_to_llvm_context {
> uint8_t num_output_clips;
> uint8_t num_output_culls;
>
> - bool has_ds_bpermute;
> + bool has_mov_dpp;
>
> bool is_gs_copy_shader;
> LLVMValueRef gs_next_vertex;
> @@ -1434,7 +1434,7 @@ static LLVMValueRef emit_ddxy(struct nir_to_llvm_context *ctx,
> LLVMValueRef result;
> ctx->has_ddxy = true;
>
> - if (!ctx->lds && !ctx->has_ds_bpermute)
> + if (!ctx->lds && !ctx->has_mov_dpp)
> ctx->lds = LLVMAddGlobalInAddressSpace(ctx->module,
> LLVMArrayType(ctx->i32, 64),
> "ddxy_lds", LOCAL_ADDR_SPACE);
> @@ -1454,7 +1454,7 @@ static LLVMValueRef emit_ddxy(struct nir_to_llvm_context *ctx,
> else
> idx = 2;
>
> - result = ac_build_ddxy(&ctx->ac, ctx->has_ds_bpermute,
> + result = ac_build_ddxy(&ctx->ac, ctx->has_mov_dpp,
> mask, idx, ctx->lds,
> src0);
> return result;
> @@ -5858,7 +5858,7 @@ LLVMModuleRef ac_translate_nir_to_llvm(LLVMTargetMachineRef tm,
> ac_llvm_context_init(&ctx.ac, ctx.context);
> ctx.ac.module = ctx.module;
>
> - ctx.has_ds_bpermute = ctx.options->chip_class >= VI;
> + ctx.has_mov_dpp = ctx.options->chip_class >= VI;
>
> memset(shader_info, 0, sizeof(*shader_info));
>
> diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
> index cb372267cde..7e83d5e5ac4 100644
> --- a/src/gallium/drivers/radeonsi/si_pipe.c
> +++ b/src/gallium/drivers/radeonsi/si_pipe.c
> @@ -944,7 +944,7 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws)
> sscreen->b.info.pfp_fw_version >= 121 &&
> sscreen->b.info.me_fw_version >= 87);
>
> - sscreen->has_ds_bpermute = sscreen->b.chip_class >= VI;
> + sscreen->has_mov_dpp = sscreen->b.chip_class >= VI;
> sscreen->has_msaa_sample_loc_bug = (sscreen->b.family >= CHIP_POLARIS10 &&
> sscreen->b.family <= CHIP_POLARIS12) ||
> sscreen->b.family == CHIP_VEGA10 ||
> diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
> index 108929c10c6..ef139fb0cd3 100644
> --- a/src/gallium/drivers/radeonsi/si_pipe.h
> +++ b/src/gallium/drivers/radeonsi/si_pipe.h
> @@ -79,7 +79,7 @@ struct si_screen {
> unsigned tess_offchip_block_dw_size;
> bool has_distributed_tess;
> bool has_draw_indirect_multi;
> - bool has_ds_bpermute;
> + bool has_mov_dpp;
> bool has_msaa_sample_loc_bug;
>
> /* Whether shaders are monolithic (1-part) or separate (3-part). */
> diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
> index 2c92269a575..2eed45d79a5 100644
> --- a/src/gallium/drivers/radeonsi/si_shader.c
> +++ b/src/gallium/drivers/radeonsi/si_shader.c
> @@ -3442,7 +3442,7 @@ static void si_llvm_emit_ddxy(
> idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2;
>
> val = LLVMBuildBitCast(gallivm->builder, emit_data->args[0], ctx->i32, "");
> - val = ac_build_ddxy(&ctx->ac, ctx->screen->has_ds_bpermute,
> + val = ac_build_ddxy(&ctx->ac, ctx->screen->has_mov_dpp,
> mask, idx, ctx->lds, val);
> emit_data->output[emit_data->chan] = val;
> }
> @@ -4454,7 +4454,7 @@ static void create_function(struct si_shader_context *ctx)
> assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
> shader->info.num_input_vgprs -= num_prolog_vgprs;
>
> - if (!ctx->screen->has_ds_bpermute &&
> + if (!ctx->screen->has_mov_dpp &&
> bld_base->info &&
> (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 ||
> bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0 ||
> --
> 2.13.0
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
More information about the mesa-dev
mailing list