Mesa (master): gallivm: optimize yuv decoding

Jose Fonseca jrfonseca at kemper.freedesktop.org
Sat Sep 25 11:40:50 UTC 2010


Module: Mesa
Branch: master
Commit: 049a8cce76b1148ab00de1ae207171e519bfafca
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=049a8cce76b1148ab00de1ae207171e519bfafca

Author: Roland Scheidegger <sroland at vmware.com>
Date:   Fri Sep 24 15:17:07 2010 +0200

gallivm: optimize yuv decoding

This is mostly a proof that vector shifts on x86 with a per-element shift count
are evil. Since we can avoid the shift with a single compare/select, use that
instead. This replaces more than 20 instructions (and slow ones at that) with
about 3, and cuts compiled shader size with Mesa's yuvsquare demo by over 10%
(no performance measurements done - but selection is blazing fast).
Might want to revisit this for future cpus - unfortunately AVX won't have
per-element vector shifts either, but AMD's XOP will; even in that case using
selection here is probably not slower.
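
For reference, the transformation in scalar form (a sketch only; the variable
names mirror the patch, but the code below is not part of it, and the & 0xff
masking still happens later as before).  The per-element variable shift

    y = (uyvy >> (16*i + 8)) & 0xff;      /* i is 0 or 1 per pixel */

becomes two constant shifts plus a compare/select on i:

    tmp  = uyvy >> 8;                     /* y when i == 0 */
    tmp2 = tmp  >> 16;                    /* y when i == 1 */
    y    = (i == 0) ? tmp : tmp2;         /* compare + select, no variable shift */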

---

 src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c |   64 ++++++++++++++++++---
 1 files changed, 55 insertions(+), 9 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
index 0a5038b..2bce289 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
@@ -35,6 +35,7 @@
 
 
 #include "util/u_format.h"
+#include "util/u_cpu_detect.h"
 
 #include "lp_bld_arit.h"
 #include "lp_bld_type.h"
@@ -42,7 +43,7 @@
 #include "lp_bld_conv.h"
 #include "lp_bld_gather.h"
 #include "lp_bld_format.h"
-
+#include "lp_bld_logic.h"
 
 /**
  * Extract Y, U, V channels from packed UYVY.
@@ -59,7 +60,7 @@ uyvy_to_yuv_soa(LLVMBuilderRef builder,
                 LLVMValueRef *v)
 {
    struct lp_type type;
-   LLVMValueRef shift, mask;
+   LLVMValueRef mask;
 
    memset(&type, 0, sizeof type);
    type.width = 32;
@@ -69,14 +70,37 @@ uyvy_to_yuv_soa(LLVMBuilderRef builder,
    assert(lp_check_value(type, i));
 
    /*
-    * y = (uyvy >> 16*i) & 0xff
+    * y = (uyvy >> (16*i + 8)) & 0xff
     * u = (uyvy        ) & 0xff
     * v = (uyvy >> 16  ) & 0xff
     */
 
-   shift = LLVMBuildMul(builder, i, lp_build_const_int_vec(type, 16), "");
-   shift = LLVMBuildAdd(builder, shift, lp_build_const_int_vec(type, 8), "");
-   *y = LLVMBuildLShr(builder, packed, shift, "");
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+   /*
+    * Avoid shift with per-element count.
+    * No support on x86, gets translated to roughly 5 instructions
+    * per element. Didn't measure performance but cuts shader size
+    * by quite a bit (less difference if cpu has no sse4.1 support).
+    */
+   if (util_cpu_caps.has_sse2 && n == 4) {
+      LLVMValueRef sel, tmp, tmp2;
+      struct lp_build_context bld32;
+
+      lp_build_context_init(&bld32, builder, type);
+
+      tmp = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(type, 8), "");
+      tmp2 = LLVMBuildLShr(builder, tmp, lp_build_const_int_vec(type, 16), "");
+      sel = lp_build_compare(builder, type, PIPE_FUNC_EQUAL, i, lp_build_const_int_vec(type, 0));
+      *y = lp_build_select(&bld32, sel, tmp, tmp2);
+   } else
+#endif
+   {
+      LLVMValueRef shift;
+      shift = LLVMBuildMul(builder, i, lp_build_const_int_vec(type, 16), "");
+      shift = LLVMBuildAdd(builder, shift, lp_build_const_int_vec(type, 8), "");
+      *y = LLVMBuildLShr(builder, packed, shift, "");
+   }
+
    *u = packed;
    *v = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(type, 16), "");
 
@@ -103,7 +127,7 @@ yuyv_to_yuv_soa(LLVMBuilderRef builder,
                 LLVMValueRef *v)
 {
    struct lp_type type;
-   LLVMValueRef shift, mask;
+   LLVMValueRef mask;
 
    memset(&type, 0, sizeof type);
    type.width = 32;
@@ -118,8 +142,30 @@ yuyv_to_yuv_soa(LLVMBuilderRef builder,
     * v = (yuyv >> 24  ) & 0xff
     */
 
-   shift = LLVMBuildMul(builder, i, lp_build_const_int_vec(type, 16), "");
-   *y = LLVMBuildLShr(builder, packed, shift, "");
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+   /*
+    * Avoid shift with per-element count.
+    * No support on x86, gets translated to roughly 5 instructions
+    * per element. Didn't measure performance but cuts shader size
+    * by quite a bit (less difference if cpu has no sse4.1 support).
+    */
+   if (util_cpu_caps.has_sse2 && n == 4) {
+      LLVMValueRef sel, tmp;
+      struct lp_build_context bld32;
+
+      lp_build_context_init(&bld32, builder, type);
+
+      tmp = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(type, 16), "");
+      sel = lp_build_compare(builder, type, PIPE_FUNC_EQUAL, i, lp_build_const_int_vec(type, 0));
+      *y = lp_build_select(&bld32, sel, packed, tmp);
+   } else
+#endif
+   {
+      LLVMValueRef shift;
+      shift = LLVMBuildMul(builder, i, lp_build_const_int_vec(type, 16), "");
+      *y = LLVMBuildLShr(builder, packed, shift, "");
+   }
+
    *u = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(type, 8), "");
    *v = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(type, 24), "");
 
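On x86 the lp_build_select() used above lowers to just a few cheap
instructions.  A rough, hypothetical SSE2 intrinsics equivalent of the
selection (not code from the patch; with SSE4.1 it collapses into a single
blend):

    #include <emmintrin.h>   /* SSE2 */

    static __m128i select_y(__m128i i, __m128i tmp, __m128i tmp2)
    {
       /* all-ones in lanes where i == 0 */
       __m128i sel = _mm_cmpeq_epi32(i, _mm_setzero_si128());
       /* take tmp where sel is set, tmp2 elsewhere */
       return _mm_or_si128(_mm_and_si128(sel, tmp),
                           _mm_andnot_si128(sel, tmp2));
    }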