[cairo] [PATCH] Add ARMv5 fast paths to pixman
Jeff Muizelaar
jeff at infidigm.net
Thu Aug 28 15:30:22 PDT 2008
The attached patch adds some assembler optimizations that use ARMv5 SIMD
instructions. The OVER optimization is about 2x-2.5x faster and the ADD
optimization is about 6x faster.
Over_8888x8888
old - mean: 2686149.600000 stddev: 23581.296352
new - mean: 1056306.300000 stddev: 4012.517180
Over_8888x8x8888
new - mean: 1479714.950000 stddev: 19549.019352
old - mean: 3032386.900000 stddev: 13282.745309
Add_8x8
new - mean: 92262.200000 stddev: 26247.943512
old - mean: 520405.500000 stddev: 7765.113911
-Jeff
-------------- next part --------------
commit 44d4231272bdf08fac077cdcaeaac1aec0dd1500
Author: Jeff Muizelaar <jmuizelaar at mozilla.com>
Date: Thu Aug 28 13:02:17 2008 -0400
arm-simd
diff --git a/configure.ac b/configure.ac
index 702bed0..7f24db5 100644
--- a/configure.ac
+++ b/configure.ac
@@ -301,6 +301,44 @@ AC_SUBST(VMX_CFLAGS)
AM_CONDITIONAL(USE_VMX, test $have_vmx_intrinsics = yes)
+dnl Check for ARM: probe whether the compiler accepts the uqadd8 instruction
+dnl in inline assembly when ARM_CFLAGS is appended to CFLAGS.
+dnl NOTE(review): ARM_CFLAGS is not assigned anywhere in this hunk; presumably
+dnl it comes from the environment or from elsewhere in configure.ac - confirm.
+dnl NOTE(review): the result variable says armv5, but uqadd8 appears to be an
+dnl ARMv6 media instruction - confirm against the ARM architecture manual.
+
+have_armv5_simd=no
+AC_MSG_CHECKING(whether to use ARM assembler)
+xserver_save_CFLAGS=$CFLAGS
+CFLAGS="$CFLAGS $ARM_CFLAGS"
+AC_COMPILE_IFELSE([
+int main () {
+ asm("uqadd8 r1, r1, r2");
+ return 0;
+}], have_armv5_simd=yes)
+CFLAGS=$xserver_save_CFLAGS
+dnl CFLAGS is restored above. The option below defaults to auto when not given.
+AC_ARG_ENABLE(arm,
+ [AC_HELP_STRING([--disable-arm],
+ [disable ARM fast paths])],
+ [enable_arm=$enableval], [enable_arm=auto])
+dnl An explicit --disable-arm overrides whatever the compile probe found.
+if test $enable_arm = no ; then
+ have_armv5_simd=disabled
+fi
+dnl Define USE_ARM only on a successful probe, otherwise drop the extra flags.
+if test $have_armv5_simd = yes ; then
+ AC_DEFINE(USE_ARM, 1, [use ARM compiler intrinsics])
+else
+ ARM_CFLAGS=
+fi
+dnl Report the outcome, and fail hard if ARM was requested but the probe failed.
+AC_MSG_RESULT($have_armv5_simd)
+if test $enable_arm = yes && test $have_armv5_simd = no ; then
+ AC_MSG_ERROR([ARM intrinsics not detected])
+fi
+dnl NOTE(review): the messages above say intrinsics, but the probe tests inline assembly.
+AC_SUBST(ARM_CFLAGS)
+
+AM_CONDITIONAL(USE_ARM, test $have_armv5_simd = yes)
+
+
AC_ARG_ENABLE(gtk,
[AC_HELP_STRING([--enable-gtk],
[enable tests using GTK+ [default=auto]])],
diff --git a/pixman/Makefile.am b/pixman/Makefile.am
index 4f046f1..2cad71a 100644
--- a/pixman/Makefile.am
+++ b/pixman/Makefile.am
@@ -77,3 +77,16 @@ libpixman_sse_la_LIBADD = $(DEP_LIBS)
libpixman_1_la_LIBADD += libpixman-sse.la
endif
+# arm code: when USE_ARM is set, build pixman-arm.c into the non-installed libtool library libpixman-arm.la (compiled with ARM_CFLAGS) and add it to libpixman-1
+if USE_ARM
+noinst_LTLIBRARIES += libpixman-arm.la
+libpixman_arm_la_SOURCES = \
+ pixman-arm.c \
+ pixman-arm.h \
+ pixman-combine32.h
+libpixman_arm_la_CFLAGS = $(DEP_CFLAGS) $(ARM_CFLAGS)
+libpixman_arm_la_LIBADD = $(DEP_LIBS)
+libpixman_1_la_LIBADD += libpixman-arm.la
+endif
+
+
diff --git a/pixman/pixman-arm.c b/pixman/pixman-arm.c
new file mode 100644
index 0000000..9750730
--- /dev/null
+++ b/pixman/pixman-arm.c
@@ -0,0 +1,312 @@
+/*
+ * Copyright © 2008 Mozilla Corporation
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Mozilla Corporation not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission. Mozilla Corporation makes no
+ * representations about the suitability of this software for any purpose. It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author: Jeff Muizelaar (jeff at infidigm.net)
+ *
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "pixman-arm.h"
+#include "pixman-combine32.h"
+/*
+ * ADD fast path for images with one byte per pixel: every destination byte
+ * is replaced by the sum of itself and the matching source byte, clamped
+ * to 255.
+ *
+ * Each row is handled in three phases: single bytes until dst is 4-byte
+ * aligned, then four bytes per uqadd8 instruction, then single bytes for
+ * whatever is left.
+ *
+ * op, pMask, xMask and yMask are not read by this function.
+ */
+void
+fbCompositeSrcAdd_8000x8000arm (pixman_op_t op,
+ pixman_image_t * pSrc,
+ pixman_image_t * pMask,
+ pixman_image_t * pDst,
+ int16_t xSrc,
+ int16_t ySrc,
+ int16_t xMask,
+ int16_t yMask,
+ int16_t xDst,
+ int16_t yDst,
+ uint16_t width,
+ uint16_t height)
+{
+ uint8_t *dstLine, *dst;
+ uint8_t *srcLine, *src;
+ int dstStride, srcStride;
+ uint16_t w;
+ uint8_t s, d;
+ uint16_t t;
+ /* fbComposeGetStart is defined elsewhere; presumably it yields the row stride and a pointer to pixel (x, y) - TODO confirm */
+ fbComposeGetStart (pSrc, xSrc, ySrc, uint8_t, srcStride, srcLine, 1);
+ fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1);
+
+ while (height--)
+ {
+ dst = dstLine;
+ dstLine += dstStride;
+ src = srcLine;
+ srcLine += srcStride;
+ w = width;
+ /* head: one byte at a time until dst is 4-byte aligned or the row ends */
+ while (w && (unsigned long)dst & 3)
+ {
+ s = *src;
+ d = *dst;
+ t = d + s;
+ /* usat clamps the sum to 0..255; C equivalent: s = t | (0 - (t >> 8)); */
+ asm("usat %0, #8, %1" : "=r"(s) : "r"(t));
+ *dst = s;
+
+ dst++;
+ src++;
+ w--;
+ }
+ /* body: uqadd8 adds four bytes at once with per-byte unsigned saturation.
+  * NOTE(review): only dst was aligned above; src is also accessed through
+  * a uint32_t cast and may be unaligned - confirm the target tolerates it. */
+ while (w >= 4)
+ {
+ asm("uqadd8 %0, %1, %2" : "=r"(*(uint32_t*)dst) : "r"(*(uint32_t*)src), "r"(*(uint32_t*)dst));
+ dst += 4;
+ src += 4;
+ w -= 4;
+ }
+ /* tail: the bytes left over after the 4-byte loop */
+ while (w)
+ {
+ s = *src;
+ d = *dst;
+ t = d + s;
+ /* usat clamps the sum to 0..255; C equivalent: s = t | (0 - (t >> 8)); */
+ asm("usat %0, #8, %1" : "=r"(s) : "r"(t));
+ *dst = s;
+
+ dst++;
+ src++;
+ w--;
+ }
+ }
+
+}
+/*
+ * OVER fast path for 32-bit pixels with no mask.  For every pixel the inline
+ * assembly below computes, on all four 8-bit channels,
+ *
+ *     dest = src + dest * (255 - src_alpha) / 255
+ *
+ * where src_alpha is the top byte of the source word, the division is the
+ * rounded approximation noted at the mla instructions, and the final
+ * addition saturates per byte (uqadd8).
+ *
+ * op, pMask, xMask and yMask are not read by this function.
+ */
+void
+fbCompositeSrc_8888x8888arm (pixman_op_t op,
+ pixman_image_t * pSrc,
+ pixman_image_t * pMask,
+ pixman_image_t * pDst,
+ int16_t xSrc,
+ int16_t ySrc,
+ int16_t xMask,
+ int16_t yMask,
+ int16_t xDst,
+ int16_t yDst,
+ uint16_t width,
+ uint16_t height)
+{
+ uint32_t *dstLine, *dst;
+ uint32_t *srcLine, *src;
+ int dstStride, srcStride;
+ uint16_t w;
+ uint32_t component_mask = 0xff00ff; /* keeps bytes 0 and 2 of a word */
+ uint32_t component_half = 0x800080; /* 0x80 in each 16-bit lane: rounding term added by mla */
+ /* fbComposeGetStart is defined elsewhere; presumably it yields the row stride and a pointer to pixel (x, y) - TODO confirm */
+ fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
+ fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
+
+ while (height--)
+ {
+ dst = dstLine;
+ dstLine += dstStride;
+ src = srcLine;
+ srcLine += srcStride;
+ w = width;
+ /* one asm block per row: it loops w times, post-incrementing dst and src */
+//#define inner_branch   -- uncomment to compile in the #ifdef inner_branch early-out below
+ asm volatile (
+ "cmp %[w], #0\n\t" /* skip the loop entirely for an empty row */
+ "beq 2f\n\t"
+ "1:\n\t"
+ /* load the next source pixel into r5 and advance src */
+ "ldr r5, [%[src]], #4\n\t"
+#ifdef inner_branch
+ /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
+ * The 0x0 case also allows us to avoid doing an unnecessary data
+ * write which is more valuable so we only check for that.
+ * NOTE(review): blt is a signed compare, so source words with bit 31
+ * set (alpha >= 0x80) would also branch to 3, and label 3 sits before
+ * the str, so the store of r5 still happens.  This path is compiled
+ * out; verify it before enabling. */
+ "cmp r5, #0x1000000\n\t"
+ "blt 3f\n\t"
+
+ /* r8 = 255 - source alpha: invert src and keep the top byte */
+ "mvn r8, r5\n\t"
+ "mov r8, r8, lsr #24\n\t"
+
+ "ldr r4, [%[dest]] \n\t" /* r4 = current destination pixel */
+
+#else
+ "ldr r4, [%[dest]] \n\t" /* r4 = current destination pixel */
+
+ /* r8 = 255 - source alpha: invert src and keep the top byte */
+ "mvn r8, r5\n\t"
+ "mov r8, r8, lsr #24\n\t"
+#endif
+ "and r6, %[component_mask], r4\n\t" /* r6 = dest bytes 0 and 2 */
+ "and r7, %[component_mask], r4, lsr #8\n\t" /* r7 = dest bytes 1 and 3 */
+
+ /* x * r8 + 0x80 per lane, then multiply by 257 and divide by 65536
+ * (done by the and/add/and steps below): a rounded x * r8 / 255 */
+ "mla r6, r6, r8, %[component_half]\n\t"
+ "mla r7, r7, r8, %[component_half]\n\t"
+
+ "and r8, %[component_mask], r6, lsr #8\n\t"
+ "and r9, %[component_mask], r7, lsr #8\n\t"
+
+ "add r6, r6, r8\n\t"
+ "add r7, r7, r9\n\t"
+
+ "and r6, %[component_mask], r6, lsr #8\n\t"
+ "and r7, %[component_mask], r7, lsr #8\n\t"
+
+ /* recombine the two byte pairs into one 32-bit pixel */
+ "orr r6, r6, r7, lsl #8\n\t"
+
+ "uqadd8 r5, r6, r5\n\t" /* r5 = src + scaled dest, saturating per byte */
+
+#ifdef inner_branch
+ "3:\n\t"
+
+#endif
+ "str r5, [%[dest]], #4\n\t" /* store the result and advance dest */
+ /* decrement the pixel counter and loop while it is non-zero */
+ "subs %[w], %[w], #1\n\t"
+ "bne 1b\n\t"
+ "2:\n\t"
+ : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
+ : [component_half] "r" (component_half), [component_mask] "r" (component_mask)
+ : "r4", "r5", "r6", "r7", "r8", "r9", "cc", "memory" /* r4-r9 are used as scratch above */
+ );
+ }
+}
+/*
+ * OVER fast path for 32-bit pixels with a single mask value.  The mask word
+ * is fetched once with fbComposeGetSolid and reduced to its top byte m.
+ * For every pixel the inline assembly then computes, on all four channels,
+ *
+ *     s    = src * m / 255
+ *     dest = s + dest * (255 - s_alpha) / 255
+ *
+ * where s_alpha is the top byte of s, both divisions are the rounded
+ * approximation noted at the mla instructions, and the final addition
+ * saturates per byte (uqadd8).
+ *
+ * op, xMask and yMask are not read by this function.
+ */
+void
+fbCompositeSrc_8888x8x8888arm (pixman_op_t op,
+ pixman_image_t * pSrc,
+ pixman_image_t * pMask,
+ pixman_image_t * pDst,
+ int16_t xSrc,
+ int16_t ySrc,
+ int16_t xMask,
+ int16_t yMask,
+ int16_t xDst,
+ int16_t yDst,
+ uint16_t width,
+ uint16_t height)
+{
+ uint32_t *dstLine, *dst;
+ uint32_t *srcLine, *src;
+ uint32_t mask;
+ int dstStride, srcStride;
+ uint16_t w;
+ uint32_t component_mask = 0xff00ff; /* keeps bytes 0 and 2 of a word */
+ uint32_t component_half = 0x800080; /* 0x80 in each 16-bit lane: rounding term added by mla */
+ /* fbComposeGetStart is defined elsewhere; presumably it yields the row stride and a pointer to pixel (x, y) - TODO confirm */
+ fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
+ fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
+ /* fbComposeGetSolid is defined elsewhere; only the top byte of the value it stores in mask is kept */
+ fbComposeGetSolid (pMask, mask, pDst->bits.format);
+ mask = (mask) >> 24;
+
+ while (height--)
+ {
+ dst = dstLine;
+ dstLine += dstStride;
+ src = srcLine;
+ srcLine += srcStride;
+ w = width;
+ /* one asm block per row: it loops w times, post-incrementing dst and src */
+//#define inner_branch   -- uncomment to compile in the #ifdef inner_branch early-out below
+ asm volatile (
+ "cmp %[w], #0\n\t" /* skip the loop entirely for an empty row */
+ "beq 2f\n\t"
+ "1:\n\t"
+ /* load the next source pixel into r5 and advance src */
+ "ldr r5, [%[src]], #4\n\t"
+#ifdef inner_branch
+ /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
+ * The 0x0 case also allows us to avoid doing an unnecessary data
+ * write which is more valuable so we only check for that.
+ * NOTE(review): blt is a signed compare, so source words with bit 31
+ * set (alpha >= 0x80) would also branch to 3, and label 3 sits before
+ * the str, so the unmodified r5 is still stored.  This path is
+ * compiled out; verify it before enabling. */
+ "cmp r5, #0x1000000\n\t"
+ "blt 3f\n\t"
+
+#endif
+ "ldr r4, [%[dest]] \n\t" /* r4 = current destination pixel */
+ /* stage 1: scale every source channel by the mask alpha */
+ "and r6, %[component_mask], r5\n\t" /* r6 = src bytes 0 and 2 */
+ "and r7, %[component_mask], r5, lsr #8\n\t" /* r7 = src bytes 1 and 3 */
+
+ /* multiply by the mask alpha (adding 0x80 per lane), then by 257 and divide by 65536 */
+ "mla r6, r6, %[mask_alpha], %[component_half]\n\t"
+ "mla r7, r7, %[mask_alpha], %[component_half]\n\t"
+
+ "and r8, %[component_mask], r6, lsr #8\n\t"
+ "and r9, %[component_mask], r7, lsr #8\n\t"
+
+ "add r6, r6, r8\n\t"
+ "add r7, r7, r9\n\t"
+
+ "and r6, %[component_mask], r6, lsr #8\n\t"
+ "and r7, %[component_mask], r7, lsr #8\n\t"
+
+ /* recombine: r5 now holds the masked source pixel */
+ "orr r5, r6, r7, lsl #8\n\t"
+ /* stage 2: scale every dest channel by 255 - alpha of the masked source */
+ "and r6, %[component_mask], r4\n\t" /* r6 = dest bytes 0 and 2 */
+ "and r7, %[component_mask], r4, lsr #8\n\t" /* r7 = dest bytes 1 and 3 */
+
+ "mvn r8, r5\n\t" /* r8 = 255 - alpha of the masked source ... */
+ "mov r8, r8, lsr #24\n\t" /* ... taken from the inverted top byte */
+
+ /* multiply by alpha (r8) then by 257 and divide by 65536 */
+ "mla r6, r6, r8, %[component_half]\n\t"
+ "mla r7, r7, r8, %[component_half]\n\t"
+
+ "and r8, %[component_mask], r6, lsr #8\n\t"
+ "and r9, %[component_mask], r7, lsr #8\n\t"
+
+ "add r6, r6, r8\n\t"
+ "add r7, r7, r9\n\t"
+
+ "and r6, %[component_mask], r6, lsr #8\n\t"
+ "and r7, %[component_mask], r7, lsr #8\n\t"
+
+ /* recombine the two byte pairs into one 32-bit pixel */
+ "orr r6, r6, r7, lsl #8\n\t"
+
+ "uqadd8 r5, r6, r5\n\t" /* r5 = masked src + scaled dest, saturating per byte */
+
+#ifdef inner_branch
+ "3:\n\t"
+
+#endif
+ "str r5, [%[dest]], #4\n\t" /* store the result and advance dest */
+ /* decrement the pixel counter and loop while it is non-zero */
+ "subs %[w], %[w], #1\n\t"
+ "bne 1b\n\t"
+ "2:\n\t"
+ : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
+ : [component_half] "r" (component_half), [component_mask] "r" (component_mask), [mask_alpha] "r" (mask)
+ : "r4", "r5", "r6", "r7", "r8", "r9", "cc", "memory" /* r4-r9 are used as scratch above */
+ );
+ }
+}
+
+
diff --git a/pixman/pixman-arm.h b/pixman/pixman-arm.h
new file mode 100644
index 0000000..06a3121
--- /dev/null
+++ b/pixman/pixman-arm.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright © 2008 Mozilla Corporation
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Mozilla Corporation not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission. Mozilla Corporation makes no
+ * representations about the suitability of this software for any purpose. It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author: Jeff Muizelaar (jeff at infidigm.net)
+ *
+ */
+
+#include "pixman-private.h"
+/*
+ * Interface to the ARM fast paths: an availability query that is usable
+ * whether or not USE_ARM is defined, plus prototypes for the composite
+ * routines, which are declared only when USE_ARM is defined.
+ * NOTE(review): this header has no include guard - confirm that it is
+ * never included twice in one translation unit.
+ */
+#ifdef USE_ARM
+/* No runtime CPU detection is done here: when compiled in, the paths are always reported as available. */
+static inline pixman_bool_t pixman_have_arm(void) { return TRUE; }
+
+#else
+#define pixman_have_arm() FALSE
+#endif
+
+#ifdef USE_ARM
+/* Composite routines; all three share one parameter list. */
+void
+fbCompositeSrcAdd_8000x8000arm (pixman_op_t op,
+ pixman_image_t * pSrc,
+ pixman_image_t * pMask,
+ pixman_image_t * pDst,
+ int16_t xSrc,
+ int16_t ySrc,
+ int16_t xMask,
+ int16_t yMask,
+ int16_t xDst,
+ int16_t yDst,
+ uint16_t width,
+ uint16_t height);
+void
+fbCompositeSrc_8888x8888arm (pixman_op_t op,
+ pixman_image_t * pSrc,
+ pixman_image_t * pMask,
+ pixman_image_t * pDst,
+ int16_t xSrc,
+ int16_t ySrc,
+ int16_t xMask,
+ int16_t yMask,
+ int16_t xDst,
+ int16_t yDst,
+ uint16_t width,
+ uint16_t height);
+
+void
+fbCompositeSrc_8888x8x8888arm (pixman_op_t op,
+ pixman_image_t * pSrc,
+ pixman_image_t * pMask,
+ pixman_image_t * pDst,
+ int16_t xSrc,
+ int16_t ySrc,
+ int16_t xMask,
+ int16_t yMask,
+ int16_t xDst,
+ int16_t yDst,
+ uint16_t width,
+ uint16_t height);
+
+#endif /* USE_ARM */
diff --git a/pixman/pixman-pict.c b/pixman/pixman-pict.c
index b918219..05abc82 100644
--- a/pixman/pixman-pict.c
+++ b/pixman/pixman-pict.c
@@ -34,6 +34,7 @@
#include "pixman-mmx.h"
#include "pixman-vmx.h"
#include "pixman-sse.h"
+#include "pixman-arm.h"
#include "pixman-combine32.h"
#ifdef __GNUC__
@@ -1479,6 +1480,18 @@ static const FastPathInfo vmx_fast_paths[] =
};
#endif
+#ifdef USE_ARM /* table of the composite routines declared in pixman-arm.h */
+static const FastPathInfo arm_fast_paths[] =
+{ /* presumably each row is: op, source format, mask format, dest format, routine, flags - confirm against FastPathInfo */
+ { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, fbCompositeSrc_8888x8888arm, 0 },
+ { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, fbCompositeSrc_8888x8x8888arm, NEED_SOLID_MASK },
+ { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, fbCompositeSrc_8888x8x8888arm, NEED_SOLID_MASK },
+ /* one-byte-per-pixel ADD */
+ { PIXMAN_OP_ADD, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, fbCompositeSrcAdd_8000x8000arm, 0 },
+ /* presumably the end-of-table sentinel - confirm against get_fast_path */
+ { PIXMAN_OP_NONE },
+};
+#endif
static const FastPathInfo c_fast_paths[] =
{
@@ -1829,6 +1842,12 @@ pixman_image_composite (pixman_op_t op,
if (!info && pixman_have_vmx())
info = get_fast_path (vmx_fast_paths, op, pSrc, pMask, pDst, pixbuf);
#endif
+#ifdef USE_ARM
+
+ if (!info && pixman_have_arm())
+ info = get_fast_path (arm_fast_paths, op, pSrc, pMask, pDst, pixbuf);
+#endif
+
if (!info)
info = get_fast_path (c_fast_paths, op, pSrc, pMask, pDst, pixbuf);
More information about the cairo
mailing list