[Mesa-dev] [PATCH 4/4] radeonsi: Implement ddx/ddy on VI using ds_bpermute

Marek Olšák maraeo at gmail.com
Wed Apr 20 11:27:41 UTC 2016


Patches 1-3:
Reviewed-by: Marek Olšák <marek.olsak at amd.com>

Patch 4:
Acked-by: Marek Olšák <marek.olsak at amd.com>

Marek


On Tue, Apr 19, 2016 at 7:52 PM, Tom Stellard <thomas.stellard at amd.com> wrote:
> The ds_bpermute instruction allows threads to transfer data directly
> to or from the VGPRs of other threads.  It uses the LDS hardware to
> transfer data, but does not read or write LDS memory.
>
> DDX BEFORE:                        |  DDX AFTER:
>                                    |
> v_mbcnt_lo_u32_b32_e64 v2, -1, 0   |  v_mbcnt_lo_u32_b32_e64 v2, -1, 0
> v_mbcnt_hi_u32_b32_e64 v2, -1, v2  |  v_mbcnt_hi_u32_b32_e64 v2, -1, v2
> v_lshlrev_b32_e32 v4, 2, v2        |  v_and_b32_e32 v2, 60, v2
> v_and_b32_e32 v2, 60, v2           |  v_lshlrev_b32_e32 v2, 2, v2
> v_lshlrev_b32_e32 v3, 2, v2        |  ds_bpermute_b32 v3, v2, v0
> s_mov_b32 m0, -1                   |  ds_bpermute_b32 v0, v2, v0 offset:4
> ds_write_b32 v4, v0                |  s_waitcnt lgkmcnt(0)
> s_waitcnt lgkmcnt(0)               |
> v_or_b32_e32 v0, 1, v2             |
> v_lshlrev_b32_e32 v0, 2, v0        |
> ds_read_b32 v1, v3                 |
> ds_read_b32 v0, v0                 |
> s_waitcnt lgkmcnt(0)               |
>                                    |
> LDS: 1 blocks                      |  LDS: 0 blocks
> ---
>  src/gallium/drivers/radeonsi/si_shader.c | 42 +++++++++++++++++++++++---------
>  1 file changed, 30 insertions(+), 12 deletions(-)
>
> diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
> index 2a747f9..d3e445b 100644
> --- a/src/gallium/drivers/radeonsi/si_shader.c
> +++ b/src/gallium/drivers/radeonsi/si_shader.c
> @@ -4162,6 +4162,7 @@ static void si_llvm_emit_ddxy(
>         LLVMValueRef indices[2];
>         LLVMValueRef store_ptr, load_ptr0, load_ptr1;
>         LLVMValueRef tl, trbl, result[4];
> +       LLVMValueRef tl_tid, trbl_tid;
>         unsigned swizzle[4];
>         unsigned c;
>         int idx;
> @@ -4179,20 +4180,24 @@ static void si_llvm_emit_ddxy(
>         else
>                 mask = TID_MASK_TOP_LEFT;
>
> -       indices[1] = LLVMBuildAnd(gallivm->builder, indices[1],
> -                                 lp_build_const_int32(gallivm, mask), "");
> +       tl_tid = LLVMBuildAnd(gallivm->builder, indices[1],
> +                               lp_build_const_int32(gallivm, mask), "");
> +       indices[1] = tl_tid;
>         load_ptr0 = LLVMBuildGEP(gallivm->builder, ctx->lds,
>                                  indices, 2, "");
>
>         /* for DDX we want to next X pixel, DDY next Y pixel. */
>         idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2;
> -       indices[1] = LLVMBuildAdd(gallivm->builder, indices[1],
> +       trbl_tid = LLVMBuildAdd(gallivm->builder, indices[1],
>                                   lp_build_const_int32(gallivm, idx), "");
> +       indices[1] = trbl_tid;
>         load_ptr1 = LLVMBuildGEP(gallivm->builder, ctx->lds,
>                                  indices, 2, "");
>
>         for (c = 0; c < 4; ++c) {
>                 unsigned i;
> +               LLVMValueRef val;
> +               LLVMValueRef args[2];
>
>                 swizzle[c] = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], c);
>                 for (i = 0; i < c; ++i) {
> @@ -4204,18 +4209,31 @@ static void si_llvm_emit_ddxy(
>                 if (i != c)
>                         continue;
>
> -               LLVMBuildStore(gallivm->builder,
> -                              LLVMBuildBitCast(gallivm->builder,
> -                                               lp_build_emit_fetch(bld_base, inst, 0, c),
> -                                               ctx->i32, ""),
> -                              store_ptr);
> +               val = LLVMBuildBitCast(gallivm->builder,
> +                               lp_build_emit_fetch(bld_base, inst, 0, c),
> +                                               ctx->i32, "");
>
> -               tl = LLVMBuildLoad(gallivm->builder, load_ptr0, "");
> -               tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, "");
> +               if ((HAVE_LLVM >= 0x0309) && ctx->screen->b.family >= CHIP_TONGA) {
>
> -               trbl = LLVMBuildLoad(gallivm->builder, load_ptr1, "");
> -               trbl = LLVMBuildBitCast(gallivm->builder, trbl, ctx->f32, "");
> +                       args[0] = LLVMBuildMul(gallivm->builder, tl_tid,
> +                                        lp_build_const_int32(gallivm, 4), "");
> +                       args[1] = val;
> +                       tl = lp_build_intrinsic(gallivm->builder,
> +                                       "llvm.amdgcn.ds.bpermute", ctx->i32,
> +                                       args, 2, LLVMReadNoneAttribute);
>
> +                       args[0] = LLVMBuildMul(gallivm->builder, trbl_tid,
> +                                        lp_build_const_int32(gallivm, 4), "");
> +                       trbl = lp_build_intrinsic(gallivm->builder,
> +                                       "llvm.amdgcn.ds.bpermute", ctx->i32,
> +                                       args, 2, LLVMReadNoneAttribute);
> +               } else {
> +                       LLVMBuildStore(gallivm->builder, val, store_ptr);
> +                       tl = LLVMBuildLoad(gallivm->builder, load_ptr0, "");
> +                       trbl = LLVMBuildLoad(gallivm->builder, load_ptr1, "");
> +               }
> +               tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, "");
> +               trbl = LLVMBuildBitCast(gallivm->builder, trbl, ctx->f32, "");
>                 result[c] = LLVMBuildFSub(gallivm->builder, trbl, tl, "");
>         }
>
> --
> 2.1.0
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev


More information about the mesa-dev mailing list