[Mesa-dev] [PATCH] ac: Use DPP for build_ddxy where possible.
Bas Nieuwenhuizen
bas at basnieuwenhuizen.nl
Wed May 23 09:48:07 UTC 2018
WQM is pretty reliable now on LLVM 7, so on VI and newer chips built
with LLVM 7 or later, let us just use DPP + WQM instead of ds.bpermute.
This gives approximately a 1.5% performance increase on the
vrcompositor built-in benchmark.
---
src/amd/common/ac_llvm_build.c | 243 ++++++++++++++++++---------------
1 file changed, 130 insertions(+), 113 deletions(-)
diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index 36c1d62637b..f849f6461ce 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -1137,119 +1137,6 @@ ac_get_thread_id(struct ac_llvm_context *ctx)
return tid;
}
-/*
- * SI implements derivatives using the local data store (LDS)
- * All writes to the LDS happen in all executing threads at
- * the same time. TID is the Thread ID for the current
- * thread and is a value between 0 and 63, representing
- * the thread's position in the wavefront.
- *
- * For the pixel shader threads are grouped into quads of four pixels.
- * The TIDs of the pixels of a quad are:
- *
- * +------+------+
- * |4n + 0|4n + 1|
- * +------+------+
- * |4n + 2|4n + 3|
- * +------+------+
- *
- * So, masking the TID with 0xfffffffc yields the TID of the top left pixel
- * of the quad, masking with 0xfffffffd yields the TID of the top pixel of
- * the current pixel's column, and masking with 0xfffffffe yields the TID
- * of the left pixel of the current pixel's row.
- *
- * Adding 1 yields the TID of the pixel to the right of the left pixel, and
- * adding 2 yields the TID of the pixel below the top pixel.
- */
-LLVMValueRef
-ac_build_ddxy(struct ac_llvm_context *ctx,
- uint32_t mask,
- int idx,
- LLVMValueRef val)
-{
- LLVMValueRef tl, trbl, args[2];
- LLVMValueRef result;
-
- if (ctx->chip_class >= VI) {
- LLVMValueRef thread_id, tl_tid, trbl_tid;
- thread_id = ac_get_thread_id(ctx);
-
- tl_tid = LLVMBuildAnd(ctx->builder, thread_id,
- LLVMConstInt(ctx->i32, mask, false), "");
-
- trbl_tid = LLVMBuildAdd(ctx->builder, tl_tid,
- LLVMConstInt(ctx->i32, idx, false), "");
-
- args[0] = LLVMBuildMul(ctx->builder, tl_tid,
- LLVMConstInt(ctx->i32, 4, false), "");
- args[1] = val;
- tl = ac_build_intrinsic(ctx,
- "llvm.amdgcn.ds.bpermute", ctx->i32,
- args, 2,
- AC_FUNC_ATTR_READNONE |
- AC_FUNC_ATTR_CONVERGENT);
-
- args[0] = LLVMBuildMul(ctx->builder, trbl_tid,
- LLVMConstInt(ctx->i32, 4, false), "");
- trbl = ac_build_intrinsic(ctx,
- "llvm.amdgcn.ds.bpermute", ctx->i32,
- args, 2,
- AC_FUNC_ATTR_READNONE |
- AC_FUNC_ATTR_CONVERGENT);
- } else {
- uint32_t masks[2] = {};
-
- switch (mask) {
- case AC_TID_MASK_TOP_LEFT:
- masks[0] = 0x8000;
- if (idx == 1)
- masks[1] = 0x8055;
- else
- masks[1] = 0x80aa;
-
- break;
- case AC_TID_MASK_TOP:
- masks[0] = 0x8044;
- masks[1] = 0x80ee;
- break;
- case AC_TID_MASK_LEFT:
- masks[0] = 0x80a0;
- masks[1] = 0x80f5;
- break;
- default:
- assert(0);
- }
-
- args[0] = val;
- args[1] = LLVMConstInt(ctx->i32, masks[0], false);
-
- tl = ac_build_intrinsic(ctx,
- "llvm.amdgcn.ds.swizzle", ctx->i32,
- args, 2,
- AC_FUNC_ATTR_READNONE |
- AC_FUNC_ATTR_CONVERGENT);
-
- args[1] = LLVMConstInt(ctx->i32, masks[1], false);
- trbl = ac_build_intrinsic(ctx,
- "llvm.amdgcn.ds.swizzle", ctx->i32,
- args, 2,
- AC_FUNC_ATTR_READNONE |
- AC_FUNC_ATTR_CONVERGENT);
- }
-
- tl = LLVMBuildBitCast(ctx->builder, tl, ctx->f32, "");
- trbl = LLVMBuildBitCast(ctx->builder, trbl, ctx->f32, "");
- result = LLVMBuildFSub(ctx->builder, trbl, tl, "");
-
- if (HAVE_LLVM >= 0x0700) {
- result = ac_build_intrinsic(ctx,
- "llvm.amdgcn.wqm.f32", ctx->f32,
- &result, 1, 0);
- }
-
- return result;
-}
-
void
ac_build_sendmsg(struct ac_llvm_context *ctx,
uint32_t msg,
@@ -2764,6 +2651,136 @@ ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
}
+/*
+ * Build a derivative as the difference of val between two lanes of a quad.
+ * TID is the thread's position in the wavefront, a value between 0 and 63.
+ *
+ * For the pixel shader threads are grouped into quads of four pixels.
+ * The TIDs of the pixels of a quad are:
+ *
+ * +------+------+
+ * |4n + 0|4n + 1|
+ * +------+------+
+ * |4n + 2|4n + 3|
+ * +------+------+
+ *
+ * Masking the TID with 0xfffffffc yields the top left pixel of the quad,
+ * 0xfffffffd the top pixel of the current pixel's column, and 0xfffffffe
+ * the left pixel of the current pixel's row. Adding idx to the masked TID
+ * (1 = pixel to the right, 2 = pixel below) selects the second lane; the
+ * result is val(masked TID + idx) - val(masked TID), computed as f32.
+ *
+ * VI+ with LLVM >= 7 reads both lanes with DPP quad_perm; VI+ with older
+ * LLVM uses ds.bpermute on the computed TIDs; other chips use ds.swizzle
+ * with hard-coded patterns per mask/idx.
+ * With LLVM >= 7 the result is additionally wrapped in llvm.amdgcn.wqm.f32.
+ */
+LLVMValueRef
+ac_build_ddxy(struct ac_llvm_context *ctx,
+ uint32_t mask,
+ int idx,
+ LLVMValueRef val)
+{
+ LLVMValueRef tl, trbl, args[2];
+ LLVMValueRef result;
+
+ if (ctx->chip_class >= VI && HAVE_LLVM >= 0x0700) {
+ LLVMValueRef zero = ctx->i32_0;
+ unsigned tl_lanes[4], trbl_lanes[4];
+
+ for (unsigned i = 0; i < 4; ++i) {
+ tl_lanes[i] = i & mask;
+ trbl_lanes[i] = (i & mask) + idx;
+ }
+
+ tl = ac_build_dpp(ctx, zero, val,
+ dpp_quad_perm(tl_lanes[0], tl_lanes[1],
+ tl_lanes[2], tl_lanes[3]),
+ 0xf, 0xf, false);
+ trbl = ac_build_dpp(ctx, zero, val,
+ dpp_quad_perm(trbl_lanes[0], trbl_lanes[1],
+ trbl_lanes[2], trbl_lanes[3]),
+ 0xf, 0xf, false);
+ } else if (ctx->chip_class >= VI) {
+ LLVMValueRef thread_id, tl_tid, trbl_tid;
+ thread_id = ac_get_thread_id(ctx);
+
+ tl_tid = LLVMBuildAnd(ctx->builder, thread_id,
+ LLVMConstInt(ctx->i32, mask, false), "");
+
+ trbl_tid = LLVMBuildAdd(ctx->builder, tl_tid,
+ LLVMConstInt(ctx->i32, idx, false), "");
+
+ args[0] = LLVMBuildMul(ctx->builder, tl_tid,
+ LLVMConstInt(ctx->i32, 4, false), "");
+ args[1] = val;
+ tl = ac_build_intrinsic(ctx,
+ "llvm.amdgcn.ds.bpermute", ctx->i32,
+ args, 2,
+ AC_FUNC_ATTR_READNONE |
+ AC_FUNC_ATTR_CONVERGENT);
+
+ args[0] = LLVMBuildMul(ctx->builder, trbl_tid,
+ LLVMConstInt(ctx->i32, 4, false), "");
+ trbl = ac_build_intrinsic(ctx,
+ "llvm.amdgcn.ds.bpermute", ctx->i32,
+ args, 2,
+ AC_FUNC_ATTR_READNONE |
+ AC_FUNC_ATTR_CONVERGENT);
+ } else {
+ uint32_t masks[2] = {};
+
+ switch (mask) {
+ case AC_TID_MASK_TOP_LEFT:
+ masks[0] = 0x8000;
+ if (idx == 1)
+ masks[1] = 0x8055;
+ else
+ masks[1] = 0x80aa;
+
+ break;
+ case AC_TID_MASK_TOP:
+ masks[0] = 0x8044;
+ masks[1] = 0x80ee;
+ break;
+ case AC_TID_MASK_LEFT:
+ masks[0] = 0x80a0;
+ masks[1] = 0x80f5;
+ break;
+ default:
+ assert(0);
+ }
+
+ args[0] = val;
+ args[1] = LLVMConstInt(ctx->i32, masks[0], false);
+
+ tl = ac_build_intrinsic(ctx,
+ "llvm.amdgcn.ds.swizzle", ctx->i32,
+ args, 2,
+ AC_FUNC_ATTR_READNONE |
+ AC_FUNC_ATTR_CONVERGENT);
+
+ args[1] = LLVMConstInt(ctx->i32, masks[1], false);
+ trbl = ac_build_intrinsic(ctx,
+ "llvm.amdgcn.ds.swizzle", ctx->i32,
+ args, 2,
+ AC_FUNC_ATTR_READNONE |
+ AC_FUNC_ATTR_CONVERGENT);
+ }
+
+ tl = LLVMBuildBitCast(ctx->builder, tl, ctx->f32, "");
+ trbl = LLVMBuildBitCast(ctx->builder, trbl, ctx->f32, "");
+ result = LLVMBuildFSub(ctx->builder, trbl, tl, "");
+
+ if (HAVE_LLVM >= 0x0700) {
+ result = ac_build_intrinsic(ctx,
+ "llvm.amdgcn.wqm.f32", ctx->f32,
+ &result, 1, 0);
+ }
+
+ return result;
+}
+
+
static inline unsigned
ds_pattern_bitmode(unsigned and_mask, unsigned or_mask, unsigned xor_mask)
{
--
2.17.0
More information about the mesa-dev
mailing list