[Mesa-dev] [PATCH] ac: Use DPP for build_ddxy where possible.

Wed May 23 09:48:07 UTC 2018

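WQM is pretty reliable now on LLVM 7, so on VI+ with LLVM >= 7 let us
just use DPP + WQM instead of ds_bpermute. The existing ds_bpermute and
ds_swizzle paths are kept as fallbacks for older LLVM and older chips.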

This gives approximately a 1.5% performance increase on the
vrcompositor built-in benchmark.
---
 src/amd/common/ac_llvm_build.c | 243 ++++++++++++++++++---------------
 1 file changed, 130 insertions(+), 113 deletions(-)

diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index 36c1d62637b..f849f6461ce 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -1137,119 +1137,6 @@ ac_get_thread_id(struct ac_llvm_context *ctx)
 	return tid;
 }
 
-/*
- * SI implements derivatives using the local data store (LDS)
- * All writes to the LDS happen in all executing threads at
- * the same time. TID is the Thread ID for the current
- * thread and is a value between 0 and 63, representing
- * the thread's position in the wavefront.
- *
- * For the pixel shader threads are grouped into quads of four pixels.
- * The TIDs of the pixels of a quad are:
- *
- *  +------+------+
- *  |4n + 0|4n + 1|
- *  +------+------+
- *  |4n + 2|4n + 3|
- *  +------+------+
- *
- * So, masking the TID with 0xfffffffc yields the TID of the top left pixel
- * of the quad, masking with 0xfffffffd yields the TID of the top pixel of
- * the current pixel's column, and masking with 0xfffffffe yields the TID
- * of the left pixel of the current pixel's row.
- *
- * Adding 1 yields the TID of the pixel to the right of the left pixel, and
- * adding 2 yields the TID of the pixel below the top pixel.
- */
-LLVMValueRef
-ac_build_ddxy(struct ac_llvm_context *ctx,
-	      uint32_t mask,
-	      int idx,
-	      LLVMValueRef val)
-{
-	LLVMValueRef tl, trbl, args[2];
-	LLVMValueRef result;
-
-	if (ctx->chip_class >= VI) {
-		LLVMValueRef thread_id, tl_tid, trbl_tid;
-		thread_id = ac_get_thread_id(ctx);
-
-		tl_tid = LLVMBuildAnd(ctx->builder, thread_id,
-				      LLVMConstInt(ctx->i32, mask, false), "");
-
-		trbl_tid = LLVMBuildAdd(ctx->builder, tl_tid,
-					LLVMConstInt(ctx->i32, idx, false), "");
-
-		args[0] = LLVMBuildMul(ctx->builder, tl_tid,
-				       LLVMConstInt(ctx->i32, 4, false), "");
-		args[1] = val;
-		tl = ac_build_intrinsic(ctx,
-					"llvm.amdgcn.ds.bpermute", ctx->i32,
-					args, 2,
-					AC_FUNC_ATTR_READNONE |
-					AC_FUNC_ATTR_CONVERGENT);
-
-		args[0] = LLVMBuildMul(ctx->builder, trbl_tid,
-				       LLVMConstInt(ctx->i32, 4, false), "");
-		trbl = ac_build_intrinsic(ctx,
-					  "llvm.amdgcn.ds.bpermute", ctx->i32,
-					  args, 2,
-					  AC_FUNC_ATTR_READNONE |
-					  AC_FUNC_ATTR_CONVERGENT);
-	} else {
-		uint32_t masks[2] = {};
-
-		switch (mask) {
-		case AC_TID_MASK_TOP_LEFT:
-			masks[0] = 0x8000;
-			if (idx == 1)
-				masks[1] = 0x8055;
-			else
-				masks[1] = 0x80aa;
-
-			break;
-		case AC_TID_MASK_TOP:
-			masks[0] = 0x8044;
-			masks[1] = 0x80ee;
-			break;
-		case AC_TID_MASK_LEFT:
-			masks[0] = 0x80a0;
-			masks[1] = 0x80f5;
-			break;
-		default:
-			assert(0);
-		}
-
-		args[0] = val;
-		args[1] = LLVMConstInt(ctx->i32, masks[0], false);
-
-		tl = ac_build_intrinsic(ctx,
-					"llvm.amdgcn.ds.swizzle", ctx->i32,
-					args, 2,
-					AC_FUNC_ATTR_READNONE |
-					AC_FUNC_ATTR_CONVERGENT);
-
-		args[1] = LLVMConstInt(ctx->i32, masks[1], false);
-		trbl = ac_build_intrinsic(ctx,
-					"llvm.amdgcn.ds.swizzle", ctx->i32,
-					args, 2,
-					AC_FUNC_ATTR_READNONE |
-					AC_FUNC_ATTR_CONVERGENT);
-	}
-
-	tl = LLVMBuildBitCast(ctx->builder, tl, ctx->f32, "");
-	trbl = LLVMBuildBitCast(ctx->builder, trbl, ctx->f32, "");
-	result = LLVMBuildFSub(ctx->builder, trbl, tl, "");
-
-	if (HAVE_LLVM >= 0x0700) {
-		result = ac_build_intrinsic(ctx,
-			"llvm.amdgcn.wqm.f32", ctx->f32,
-			&result, 1, 0);
-	}
-
-	return result;
-}
-
 void
 ac_build_sendmsg(struct ac_llvm_context *ctx,
 		 uint32_t msg,
@@ -2764,6 +2651,136 @@ ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
 	return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
 }
 
+/*
+ * Build a screen-space derivative of val: every pixel fetches val from
+ * two lanes of its own quad and the result is (trbl - tl) as f32.
+ *
+ * For the pixel shader threads are grouped into quads of four pixels.
+ * TID is a thread's position in the wavefront (0..63); the TIDs of the
+ * pixels of a quad are:
+ *
+ *  +------+------+
+ *  |4n + 0|4n + 1|
+ *  +------+------+
+ *  |4n + 2|4n + 3|
+ *  +------+------+
+ *
+ * mask selects the first lane ("tl"): masking the TID with 0xfffffffc
+ * yields the top left pixel of the quad, 0xfffffffd the top pixel of the
+ * current column, and 0xfffffffe the left pixel of the current row.
+ * Adding idx (1 = pixel to the right, 2 = pixel below) gives "trbl".
+ *
+ * The lane exchange uses a DPP quad_perm on VI+ with LLVM 7, ds.bpermute
+ * on VI+ with older LLVM, and ds.swizzle (the original LDS-based scheme)
+ * on older chips. With LLVM 7 the difference is additionally passed
+ * through llvm.amdgcn.wqm.f32.
+ */
+LLVMValueRef
+ac_build_ddxy(struct ac_llvm_context *ctx,
+	      uint32_t mask,
+	      int idx,
+	      LLVMValueRef val)
+{
+	LLVMValueRef tl, trbl, args[2];
+	LLVMValueRef result;
+
+	if (ctx->chip_class >= VI && HAVE_LLVM >= 0x0700) {
+		LLVMValueRef zero = ctx->i32_0;
+		unsigned tl_lanes[4], trbl_lanes[4];
+
+		for (unsigned i = 0; i < 4; ++i) {
+			tl_lanes[i] = i & mask;
+			trbl_lanes[i] = (i & mask) + idx;
+		}
+
+		tl = ac_build_dpp(ctx, zero, val,
+				  dpp_quad_perm(tl_lanes[0], tl_lanes[1],
+						tl_lanes[2], tl_lanes[3]),
+				  0xf, 0xf, false);
+		trbl = ac_build_dpp(ctx, zero, val,
+				    dpp_quad_perm(trbl_lanes[0], trbl_lanes[1],
+						  trbl_lanes[2], trbl_lanes[3]),
+				    0xf, 0xf, false);
+	} else if (ctx->chip_class >= VI) {
+		LLVMValueRef thread_id, tl_tid, trbl_tid;
+		thread_id = ac_get_thread_id(ctx);
+
+		tl_tid = LLVMBuildAnd(ctx->builder, thread_id,
+				      LLVMConstInt(ctx->i32, mask, false), "");
+
+		trbl_tid = LLVMBuildAdd(ctx->builder, tl_tid,
+					LLVMConstInt(ctx->i32, idx, false), "");
+
+		args[0] = LLVMBuildMul(ctx->builder, tl_tid,
+				       LLVMConstInt(ctx->i32, 4, false), "");
+		args[1] = val;
+		tl = ac_build_intrinsic(ctx,
+					"llvm.amdgcn.ds.bpermute", ctx->i32,
+					args, 2,
+					AC_FUNC_ATTR_READNONE |
+					AC_FUNC_ATTR_CONVERGENT);
+
+		args[0] = LLVMBuildMul(ctx->builder, trbl_tid,
+				       LLVMConstInt(ctx->i32, 4, false), "");
+		trbl = ac_build_intrinsic(ctx,
+					  "llvm.amdgcn.ds.bpermute", ctx->i32,
+					  args, 2,
+					  AC_FUNC_ATTR_READNONE |
+					  AC_FUNC_ATTR_CONVERGENT);
+	} else {
+		uint32_t masks[2] = {};
+
+		switch (mask) {
+		case AC_TID_MASK_TOP_LEFT:
+			masks[0] = 0x8000;
+			if (idx == 1)
+				masks[1] = 0x8055;
+			else
+				masks[1] = 0x80aa;
+
+			break;
+		case AC_TID_MASK_TOP:
+			masks[0] = 0x8044;
+			masks[1] = 0x80ee;
+			break;
+		case AC_TID_MASK_LEFT:
+			masks[0] = 0x80a0;
+			masks[1] = 0x80f5;
+			break;
+		default:
+			assert(0);
+		}
+
+		args[0] = val;
+		args[1] = LLVMConstInt(ctx->i32, masks[0], false);
+
+		tl = ac_build_intrinsic(ctx,
+					"llvm.amdgcn.ds.swizzle", ctx->i32,
+					args, 2,
+					AC_FUNC_ATTR_READNONE |
+					AC_FUNC_ATTR_CONVERGENT);
+
+		args[1] = LLVMConstInt(ctx->i32, masks[1], false);
+		trbl = ac_build_intrinsic(ctx,
+					"llvm.amdgcn.ds.swizzle", ctx->i32,
+					args, 2,
+					AC_FUNC_ATTR_READNONE |
+					AC_FUNC_ATTR_CONVERGENT);
+	}
+
+	tl = LLVMBuildBitCast(ctx->builder, tl, ctx->f32, "");
+	trbl = LLVMBuildBitCast(ctx->builder, trbl, ctx->f32, "");
+	result = LLVMBuildFSub(ctx->builder, trbl, tl, "");
+
+	if (HAVE_LLVM >= 0x0700) {
+		result = ac_build_intrinsic(ctx,
+			"llvm.amdgcn.wqm.f32", ctx->f32,
+			&result, 1, 0);
+	}
+
+	return result;
+}
+
 static inline unsigned
 ds_pattern_bitmode(unsigned and_mask, unsigned or_mask, unsigned xor_mask)
 {
-- 
2.17.0