[Mesa-dev] [PATCH 2/6] gallivm: optimize SoA AoS fallback fetch path a little
sroland at vmware.com
sroland at vmware.com
Mon Dec 12 00:11:58 UTC 2016
From: Roland Scheidegger <sroland at vmware.com>
We should do transpose, not extract/insert, at least with "sufficient" amount
of channels (for 4 channels, extract/insert shuffles generated otherwise look
truly terrifying). Albeit we shouldn't fallback to that so often in any case.
---
src/gallium/auxiliary/gallivm/lp_bld_format_soa.c | 83 +++++++++++++++++++----
1 file changed, 70 insertions(+), 13 deletions(-)
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
index 389bfa0..902c763 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
@@ -40,6 +40,39 @@
#include "lp_bld_debug.h"
#include "lp_bld_format.h"
#include "lp_bld_arit.h"
+#include "lp_bld_pack.h"
+
+
+static void
+convert_to_soa(struct gallivm_state *gallivm,
+ LLVMValueRef src_aos[LP_MAX_VECTOR_WIDTH / 32],
+ LLVMValueRef dst_soa[4],
+ const struct lp_type soa_type)
+{
+ unsigned j, k;
+ struct lp_type aos_channel_type = soa_type;
+
+ LLVMValueRef aos_channels[4];
+ unsigned pixels_per_channel = soa_type.length / 4;
+
+ debug_assert((soa_type.length % 4) == 0);
+
+ aos_channel_type.length >>= 1;
+
+ for (j = 0; j < 4; ++j) {
+ LLVMValueRef channel[LP_MAX_VECTOR_LENGTH] = { 0 };
+
+ assert(pixels_per_channel <= LP_MAX_VECTOR_LENGTH);
+
+ for (k = 0; k < pixels_per_channel; ++k) {
+ channel[k] = src_aos[j + 4 * k];
+ }
+
+ aos_channels[j] = lp_build_concat(gallivm, channel, aos_channel_type, pixels_per_channel);
+ }
+
+ lp_build_transpose_aos(gallivm, soa_type, aos_channels, dst_soa);
+}
void
@@ -48,9 +81,6 @@ lp_build_format_swizzle_soa(const struct util_format_description *format_desc,
const LLVMValueRef *unswizzled,
LLVMValueRef swizzled_out[4])
{
- assert(PIPE_SWIZZLE_0 == (int)PIPE_SWIZZLE_0);
- assert(PIPE_SWIZZLE_1 == (int)PIPE_SWIZZLE_1);
-
if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
enum pipe_swizzle swizzle;
LLVMValueRef depth_or_stencil;
@@ -547,9 +577,11 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
{
unsigned k, chan;
struct lp_type tmp_type;
+ LLVMValueRef aos_fetch[LP_MAX_VECTOR_WIDTH / 32];
+ boolean vec_transpose = FALSE;
if (gallivm_debug & GALLIVM_DEBUG_PERF) {
- debug_printf("%s: scalar unpacking of %s\n",
+ debug_printf("%s: AoS fetch fallback for %s\n",
__FUNCTION__, format_desc->short_name);
}
@@ -560,12 +592,31 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
rgba_out[chan] = lp_build_undef(gallivm, type);
}
+ if (format_desc->nr_channels > 2 ||
+ format_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) {
+ /*
+ * Note that vector transpose can be worse. This is because
+ * llvm will ensure the missing channels have the correct
+ * values, in particular typically 1.0 for the last channel
+ * (if they are used or not doesn't matter, usually llvm can't
+ * figure this out here probably due to the transpose).
+ * But with the extract/insert path, since those missing elements
+ * were just directly inserted/extracted llvm can optimize this
+ * somewhat (though it still doesn't look great - and not for
+ * the compressed formats due to their external fetch funcs).
+ * So restrict to cases where we are sure it helps (albeit
+ * with 2 channels it MIGHT be worth it at least with AVX).
+ * In any case, this is just a bandaid, it does NOT replace proper
+ * SoA format unpack.
+ */
+ vec_transpose = TRUE;
+ }
+
/* loop over number of pixels */
for(k = 0; k < type.length; ++k) {
LLVMValueRef index = lp_build_const_int32(gallivm, k);
LLVMValueRef offset_elem;
LLVMValueRef i_elem, j_elem;
- LLVMValueRef tmp;
offset_elem = LLVMBuildExtractElement(builder, offset,
index, "");
@@ -574,20 +625,26 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
j_elem = LLVMBuildExtractElement(builder, j, index, "");
/* Get a single float[4]={R,G,B,A} pixel */
- tmp = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type,
- aligned, base_ptr, offset_elem,
- i_elem, j_elem, cache);
+ aos_fetch[k] = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type,
+ aligned, base_ptr, offset_elem,
+ i_elem, j_elem, cache);
/*
* Insert the AoS tmp value channels into the SoA result vectors at
* position = 'index'.
*/
- for (chan = 0; chan < 4; ++chan) {
- LLVMValueRef chan_val = lp_build_const_int32(gallivm, chan),
- tmp_chan = LLVMBuildExtractElement(builder, tmp, chan_val, "");
- rgba_out[chan] = LLVMBuildInsertElement(builder, rgba_out[chan],
- tmp_chan, index, "");
+ if (!vec_transpose) {
+ for (chan = 0; chan < 4; ++chan) {
+ LLVMValueRef chan_val = lp_build_const_int32(gallivm, chan),
+ tmp_chan = LLVMBuildExtractElement(builder, aos_fetch[k], chan_val, "");
+ rgba_out[chan] = LLVMBuildInsertElement(builder, rgba_out[chan],
+ tmp_chan, index, "");
+ }
}
}
+ if (vec_transpose) {
+ convert_to_soa(gallivm, aos_fetch, rgba_out, type);
+ }
+
}
}
--
2.7.4
More information about the mesa-dev
mailing list