[Mesa-dev] [PATCH] ac: Use mov_dpp for derivatives.

Marek Olšák maraeo at gmail.com
Sun Jun 11 14:37:14 UTC 2017


Hi Bas,

Have you run piglit on radeonsi with this patch?

Marek

On Sat, Jun 10, 2017 at 10:05 PM, Bas Nieuwenhuizen
<bas at basnieuwenhuizen.nl> wrote:
> mov_dpp is slightly faster than ds_bpermute, and it seems to be supported
> since at least LLVM 3.9.
>
> v2: Since this supersedes bpermute, remove the bpermute code.
> Signed-off-by: Bas Nieuwenhuizen <basni at google.com>
> ---
>  src/amd/common/ac_llvm_build.c           | 47 ++++++++++++++++++++------------
>  src/amd/common/ac_llvm_build.h           |  2 +-
>  src/amd/common/ac_nir_to_llvm.c          |  8 +++---
>  src/gallium/drivers/radeonsi/si_pipe.c   |  2 +-
>  src/gallium/drivers/radeonsi/si_pipe.h   |  2 +-
>  src/gallium/drivers/radeonsi/si_shader.c |  4 +--
>  6 files changed, 38 insertions(+), 27 deletions(-)
>
> diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
> index 237e9291d41..99d41bf52d6 100644
> --- a/src/amd/common/ac_llvm_build.c
> +++ b/src/amd/common/ac_llvm_build.c
> @@ -783,41 +783,52 @@ ac_get_thread_id(struct ac_llvm_context *ctx)
>   */
>  LLVMValueRef
>  ac_build_ddxy(struct ac_llvm_context *ctx,
> -             bool has_ds_bpermute,
> +             bool has_mov_dpp,
>               uint32_t mask,
>               int idx,
>               LLVMValueRef lds,
>               LLVMValueRef val)
>  {
> -       LLVMValueRef thread_id, tl, trbl, tl_tid, trbl_tid, args[2];
> +       LLVMValueRef thread_id, tl, trbl, args[5];
>         LLVMValueRef result;
>
> -       thread_id = ac_get_thread_id(ctx);
> +       if (has_mov_dpp) {
> +               uint32_t tl_ctrl = 0, trbl_ctrl = 0;
>
> -       tl_tid = LLVMBuildAnd(ctx->builder, thread_id,
> -                             LLVMConstInt(ctx->i32, mask, false), "");
> -
> -       trbl_tid = LLVMBuildAdd(ctx->builder, tl_tid,
> -                               LLVMConstInt(ctx->i32, idx, false), "");
> +               for (unsigned i = 0; i < 4; ++i) {
> +                       tl_ctrl |= (i & mask) << (2 * i);
> +                       trbl_ctrl |= ((i & mask) + idx) << (2 * i);
> +               }
>
> -       if (has_ds_bpermute) {
> -               args[0] = LLVMBuildMul(ctx->builder, tl_tid,
> -                                      LLVMConstInt(ctx->i32, 4, false), "");
> -               args[1] = val;
> +               args[0] = val;
> +               args[1] = LLVMConstInt(ctx->i32, tl_ctrl, false);
> +               args[2] = LLVMConstInt(ctx->i32, 0xf, false);
> +               args[3] = LLVMConstInt(ctx->i32, 0xf, false);
> +               args[4] = LLVMConstInt(ctx->i1, 1, false);
>                 tl = ac_build_intrinsic(ctx,
> -                                       "llvm.amdgcn.ds.bpermute", ctx->i32,
> -                                       args, 2,
> +                                       "llvm.amdgcn.mov.dpp.i32", ctx->i32,
> +                                       args, 5,
>                                         AC_FUNC_ATTR_READNONE |
>                                         AC_FUNC_ATTR_CONVERGENT);
>
> -               args[0] = LLVMBuildMul(ctx->builder, trbl_tid,
> -                                      LLVMConstInt(ctx->i32, 4, false), "");
> +               args[1] = LLVMConstInt(ctx->i32, trbl_ctrl, false);
>                 trbl = ac_build_intrinsic(ctx,
> -                                         "llvm.amdgcn.ds.bpermute", ctx->i32,
> -                                         args, 2,
> +                                         "llvm.amdgcn.mov.dpp.i32", ctx->i32,
> +                                         args, 5,
>                                           AC_FUNC_ATTR_READNONE |
>                                           AC_FUNC_ATTR_CONVERGENT);
>         } else {
> +               LLVMValueRef tl_tid, trbl_tid;
> +
> +               thread_id = ac_get_thread_id(ctx);
> +
> +               tl_tid = LLVMBuildAnd(ctx->builder, thread_id,
> +                               LLVMConstInt(ctx->i32, mask, false), "");
> +
> +               trbl_tid = LLVMBuildAdd(ctx->builder, tl_tid,
> +                                       LLVMConstInt(ctx->i32, idx, false), "");
> +
> +
>                 LLVMValueRef store_ptr, load_ptr0, load_ptr1;
>
>                 store_ptr = ac_build_gep0(ctx, lds, thread_id);
> diff --git a/src/amd/common/ac_llvm_build.h b/src/amd/common/ac_llvm_build.h
> index ebb78fbd79b..14260b05018 100644
> --- a/src/amd/common/ac_llvm_build.h
> +++ b/src/amd/common/ac_llvm_build.h
> @@ -161,7 +161,7 @@ ac_get_thread_id(struct ac_llvm_context *ctx);
>
>  LLVMValueRef
>  ac_build_ddxy(struct ac_llvm_context *ctx,
> -             bool has_ds_bpermute,
> +             bool has_mov_dpp,
>               uint32_t mask,
>               int idx,
>               LLVMValueRef lds,
> diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
> index 49117d21bd2..2385c60d316 100644
> --- a/src/amd/common/ac_nir_to_llvm.c
> +++ b/src/amd/common/ac_nir_to_llvm.c
> @@ -164,7 +164,7 @@ struct nir_to_llvm_context {
>         uint8_t num_output_clips;
>         uint8_t num_output_culls;
>
> -       bool has_ds_bpermute;
> +       bool has_mov_dpp;
>
>         bool is_gs_copy_shader;
>         LLVMValueRef gs_next_vertex;
> @@ -1434,7 +1434,7 @@ static LLVMValueRef emit_ddxy(struct nir_to_llvm_context *ctx,
>         LLVMValueRef result;
>         ctx->has_ddxy = true;
>
> -       if (!ctx->lds && !ctx->has_ds_bpermute)
> +       if (!ctx->lds && !ctx->has_mov_dpp)
>                 ctx->lds = LLVMAddGlobalInAddressSpace(ctx->module,
>                                                        LLVMArrayType(ctx->i32, 64),
>                                                        "ddxy_lds", LOCAL_ADDR_SPACE);
> @@ -1454,7 +1454,7 @@ static LLVMValueRef emit_ddxy(struct nir_to_llvm_context *ctx,
>         else
>                 idx = 2;
>
> -       result = ac_build_ddxy(&ctx->ac, ctx->has_ds_bpermute,
> +       result = ac_build_ddxy(&ctx->ac, ctx->has_mov_dpp,
>                               mask, idx, ctx->lds,
>                               src0);
>         return result;
> @@ -5858,7 +5858,7 @@ LLVMModuleRef ac_translate_nir_to_llvm(LLVMTargetMachineRef tm,
>         ac_llvm_context_init(&ctx.ac, ctx.context);
>         ctx.ac.module = ctx.module;
>
> -       ctx.has_ds_bpermute = ctx.options->chip_class >= VI;
> +       ctx.has_mov_dpp = ctx.options->chip_class >= VI;
>
>         memset(shader_info, 0, sizeof(*shader_info));
>
> diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
> index cb372267cde..7e83d5e5ac4 100644
> --- a/src/gallium/drivers/radeonsi/si_pipe.c
> +++ b/src/gallium/drivers/radeonsi/si_pipe.c
> @@ -944,7 +944,7 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws)
>                  sscreen->b.info.pfp_fw_version >= 121 &&
>                  sscreen->b.info.me_fw_version >= 87);
>
> -       sscreen->has_ds_bpermute = sscreen->b.chip_class >= VI;
> +       sscreen->has_mov_dpp = sscreen->b.chip_class >= VI;
>         sscreen->has_msaa_sample_loc_bug = (sscreen->b.family >= CHIP_POLARIS10 &&
>                                             sscreen->b.family <= CHIP_POLARIS12) ||
>                                            sscreen->b.family == CHIP_VEGA10 ||
> diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
> index 108929c10c6..ef139fb0cd3 100644
> --- a/src/gallium/drivers/radeonsi/si_pipe.h
> +++ b/src/gallium/drivers/radeonsi/si_pipe.h
> @@ -79,7 +79,7 @@ struct si_screen {
>         unsigned                        tess_offchip_block_dw_size;
>         bool                            has_distributed_tess;
>         bool                            has_draw_indirect_multi;
> -       bool                            has_ds_bpermute;
> +       bool                            has_mov_dpp;
>         bool                            has_msaa_sample_loc_bug;
>
>         /* Whether shaders are monolithic (1-part) or separate (3-part). */
> diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
> index 2c92269a575..2eed45d79a5 100644
> --- a/src/gallium/drivers/radeonsi/si_shader.c
> +++ b/src/gallium/drivers/radeonsi/si_shader.c
> @@ -3442,7 +3442,7 @@ static void si_llvm_emit_ddxy(
>         idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2;
>
>         val = LLVMBuildBitCast(gallivm->builder, emit_data->args[0], ctx->i32, "");
> -       val = ac_build_ddxy(&ctx->ac, ctx->screen->has_ds_bpermute,
> +       val = ac_build_ddxy(&ctx->ac, ctx->screen->has_mov_dpp,
>                             mask, idx, ctx->lds, val);
>         emit_data->output[emit_data->chan] = val;
>  }
> @@ -4454,7 +4454,7 @@ static void create_function(struct si_shader_context *ctx)
>         assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
>         shader->info.num_input_vgprs -= num_prolog_vgprs;
>
> -       if (!ctx->screen->has_ds_bpermute &&
> +       if (!ctx->screen->has_mov_dpp &&
>             bld_base->info &&
>             (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 ||
>              bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0 ||
> --
> 2.13.0
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev


More information about the mesa-dev mailing list