[Pixman] [PATCH] ARM: make use of UQADD8 instruction even in generic C code paths

Thu Dec 6 09:45:44 PST 2012

ARMv6 has the UQADD8 instruction, which implements unsigned saturating
addition for 8-bit values packed in 32-bit registers. It is very useful
for the UN8x4_ADD_UN8x4, UN8_rb_ADD_UN8_rb and ADD_UN8 macros (which
would otherwise need many arithmetic operations to emulate it).
Since most of the major ARM Linux distros are built for ARMv7, we are
much less dependent on runtime CPU detection and can get practical
benefits from conditional compilation here for a lot of users.

The results of the cairo-perf-trace benchmark on an ARM Cortex-A15 with pixman
compiled by gcc 4.7.2 and PIXMAN_DISABLE set to "arm-simd arm-neon":

Speedups
========
image    firefox-talos-gfx  (29938.22 0.12%) ->  (27814.76 0.51%) : 1.08x speedup
image    firefox-asteroids  (23241.11 0.07%) ->  (21795.19 0.07%) : 1.07x speedup
image firefox-canvas-alpha (174519.85 0.08%) -> (164788.64 0.20%) : 1.06x speedup
image              poppler   (9464.46 1.61%) ->   (8991.53 0.14%) : 1.05x speedup
---
 pixman/pixman-combine32.h |   47 +++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 47 insertions(+), 0 deletions(-)

diff --git a/pixman/pixman-combine32.h b/pixman/pixman-combine32.h
index 875dde3..cdd56a6 100644
--- a/pixman/pixman-combine32.h
+++ b/pixman/pixman-combine32.h
@@ -20,6 +20,47 @@
 #define BLUE_8(x) ((x) & MASK)
 
 /*
+ * ARMv6 has the UQADD8 instruction, which implements unsigned saturating
+ * addition for 8-bit values packed in 32-bit registers. It is very useful
+ * for the UN8x4_ADD_UN8x4, UN8_rb_ADD_UN8_rb and ADD_UN8 macros (which
+ * would otherwise need many arithmetic operations to emulate it).
+ * Since most of the major ARM Linux distros are built for ARMv7, we are
+ * much less dependent on runtime CPU detection and can get practical
+ * benefits from conditional compilation here for a lot of users.
+ */
+
+#if defined(USE_GCC_INLINE_ASM) && defined(__arm__) && \
+    !defined(__aarch64__) && (!defined(__thumb__) || defined(__thumb2__))
+/* __ARM_FEATURE_SIMD32 covers newer cores; ARMv6-M/ARMv7-M lack UQADD8. */
+#if defined(__ARM_FEATURE_SIMD32) || defined(__ARM_ARCH_6__)   || \
+    defined(__ARM_ARCH_6J__)      || defined(__ARM_ARCH_6K__)  || \
+    defined(__ARM_ARCH_6Z__)      || defined(__ARM_ARCH_6ZK__) || \
+    defined(__ARM_ARCH_6T2__)     || defined(__ARM_ARCH_7__)   || \
+    defined(__ARM_ARCH_7A__)      || defined(__ARM_ARCH_7R__)  || \
+    defined(__ARM_ARCH_7EM__)
+
+/* Add x and y as four packed bytes, each sum saturating at 255 (UQADD8). */
+static force_inline uint32_t
+un8x4_add_un8x4 (uint32_t x, uint32_t y)
+{
+    uint32_t t;
+    __asm__ ("uqadd8 %0, %1, %2" : "=r" (t) : "%r" (x), "r" (y));
+    return t;
+}
+
+/* Saturating adds: the first two store into x, ADD_UN8 yields the sum. */
+#define UN8x4_ADD_UN8x4(x, y) \
+    ((x) = un8x4_add_un8x4 ((x), (y)))
+#define UN8_rb_ADD_UN8_rb(x, y, t) \
+    ((t) = un8x4_add_un8x4 ((x), (y)), (x) = (t))
+#define ADD_UN8(x, y, t) \
+    ((t) = (x), un8x4_add_un8x4 ((t), (y)))
+#endif /* UQADD8 available */
+#endif /* USE_GCC_INLINE_ASM && 32-bit ARM */
+
+/*****************************************************************************/
+
+/*
  * Helper macros.
  */
 
@@ -29,9 +70,11 @@
 #define DIV_UN8(a, b)							\
     (((uint16_t) (a) * MASK + ((b) / 2)) / (b))
 
+#ifndef ADD_UN8
 #define ADD_UN8(x, y, t)				     \
     ((t) = (x) + (y),					     \
      (uint32_t) (uint8_t) ((t) | (0 - ((t) >> G_SHIFT))))
+#endif
 
 #define DIV_ONE_UN8(x)							\
     (((x) + ONE_HALF + (((x) + ONE_HALF) >> G_SHIFT)) >> G_SHIFT)
@@ -56,6 +99,7 @@
 /*
  * x_rb = min (x_rb + y_rb, 255)
  */
+#ifndef UN8_rb_ADD_UN8_rb
 #define UN8_rb_ADD_UN8_rb(x, y, t)					\
     do									\
     {									\
@@ -63,6 +107,7 @@
 	t |= RB_MASK_PLUS_ONE - ((t >> G_SHIFT) & RB_MASK);		\
 	x = (t & RB_MASK);						\
     } while (0)
+#endif
 
 /*
  * x_rb = (x_rb * a_rb) / 255
@@ -208,6 +253,7 @@
 /*
   x_c = min(x_c + y_c, 255)
 */
+#ifndef UN8x4_ADD_UN8x4
 #define UN8x4_ADD_UN8x4(x, y)						\
     do									\
     {									\
@@ -223,3 +269,4 @@
 									\
 	x = r1__ | (r2__ << G_SHIFT);					\
     } while (0)
+#endif
-- 
1.7.8.6