[pulseaudio-commits] 4 commits - src/Makefile.am src/pulsecore

Sun May 25 09:16:07 PDT 2014

src/Makefile.am            |    6 
 src/pulsecore/cpu-arm.c    |   22 +
 src/pulsecore/cpu-arm.h    |    4 
 src/pulsecore/mix_neon.c   |  131 +++++++++++
 src/pulsecore/remap_neon.c |  498 +++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 651 insertions(+), 10 deletions(-)

New commits:
commit a9d38b50e8cf4d841fe7beebe20c5d1d8bf7f777
Author: Peter Meerwald <p.meerwald at bct-electronic.com>
Date:   Thu Jan 12 13:18:11 2012 +0100

    core: Initialize ARM NEON remapping code if available
    
    Signed-off-by: Peter Meerwald <p.meerwald at bct-electronic.com>

diff --git a/src/pulsecore/cpu-arm.c b/src/pulsecore/cpu-arm.c
index 32f0e53..ad09c87 100644
--- a/src/pulsecore/cpu-arm.c
+++ b/src/pulsecore/cpu-arm.c
@@ -150,10 +150,12 @@ bool pa_cpu_init_arm(pa_cpu_arm_flag_t *flags) {
 
     if (*flags & PA_CPU_ARM_V6)
         pa_volume_func_init_arm(*flags);
+
 #ifdef HAVE_NEON
     if (*flags & PA_CPU_ARM_NEON) {
         pa_convert_func_init_neon(*flags);
         pa_mix_func_init_neon(*flags);
+        pa_remap_func_init_neon(*flags);
     }
 #endif
 
diff --git a/src/pulsecore/cpu-arm.h b/src/pulsecore/cpu-arm.h
index 51f7395..4ee7dff 100644
--- a/src/pulsecore/cpu-arm.h
+++ b/src/pulsecore/cpu-arm.h
@@ -49,6 +49,7 @@ void pa_volume_func_init_arm(pa_cpu_arm_flag_t flags);
 #ifdef HAVE_NEON
 void pa_convert_func_init_neon(pa_cpu_arm_flag_t flags);
 void pa_mix_func_init_neon(pa_cpu_arm_flag_t flags);
+void pa_remap_func_init_neon(pa_cpu_arm_flag_t flags);
 #endif
 
 #endif /* foocpuarmhfoo */

commit 54a10eb915291e186f6b4da7d1ebc06683403600
Author: Peter Meerwald <p.meerwald at bct-electronic.com>
Date:   Wed Apr 16 19:49:35 2014 +0200

    remap: Add ARM NEON optimized remapping and rearrange code
    
    v7:
    * cleanups and reduce code; add 4->4 channels mappings, add rearrange code
    v6:
    * rename mono_to_stereo_float_neon_a9() to mono_to_stereo_float_arm_generic(); note that
    Cortex-A8 and -A9/A15 are different, later chips do not benefit from NEON memory transfers
    v5:
    * 4-channel remapping
    * use vrhadd instruction, fix int16 overflow for to-mono case
    v4:
    * fix for sample length < 4
    v3:
    * fix test code: init float and int map_table
    * different code path for Cortex-A8 and later (-A9, A15, unknown)
    * convert from intrinsics to inline assembly
    v2:
    * add ARM NEON stereo-to-mono remapping code
    * static __attribute__ ((noinline)) is necessary to prevent inlining and
      work around gcc 4.6 ICE, see https://bugs.launchpad.net/bugs/936863
    * call test code, the reference implementation is obtained using
      pa_get_init_remap_func()
    * remove check for NEON flags
    v1:
    * ARM NEON mono-to-stereo remapping code
    
    note that orig is the time of the special-case C implementation where available, not
    the generic matrix remapping implementation
    
    on ARM Cortex-A8 (TI OMAP3 DM3730 @ 1GHz) (Linaro GCC 4.6):
    
    Checking NEON remap (float, mono->stereo)
    func: 757474 usec (avg: 7574.74, min = 6165, max = 11963, stddev = 1479.71).
    orig: 784882 usec (avg: 7848.82, min = 6835, max = 17639, stddev = 1656.01).
    Checking NEON remap (float, mono->4-channel)
    func: 1545507 usec (avg: 15455.1, min = 6531, max = 30609, stddev = 2689.6).
    orig: 2601413 usec (avg: 26014.1, min = 22796, max = 52979, stddev = 3281.84).
    Checking NEON remap (s16, mono->stereo)
    func: 343844 usec (avg: 3438.44, min = 1709, max = 8880, stddev = 1180.1).
    orig: 474460 usec (avg: 4744.6, min = 4212, max = 7751, stddev = 1069.29).
    Checking NEON remap (s16, mono->4-channel)
    func: 736574 usec (avg: 7365.74, min = 3784, max = 11902, stddev = 1637.79).
    orig: 1062772 usec (avg: 10627.7, min = 7630, max = 17517, stddev = 3011.44).
    Checking NEON remap (float, stereo->mono)
    func: 571412 usec (avg: 5714.12, min = 4608, max = 15808, stddev = 2131.7).
    orig: 4356630 usec (avg: 43566.3, min = 41596, max = 52430, stddev = 2056.79).
    Checking NEON remap (float, 4-channel->mono)
    func: 1443202 usec (avg: 14432, min = 12298, max = 32349, stddev = 3300).
    orig: 9273410 usec (avg: 92734.1, min = 81940, max = 184265, stddev = 23310).
    Checking NEON remap (s16, stereo->mono)
    func: 185761 usec (avg: 1857.61, min = 1556, max = 4975, stddev = 743.681).
    orig: 1204776 usec (avg: 12047.8, min = 10711, max = 16022, stddev = 1596.88).
    Checking NEON remap (s16, 4-channel->mono)
    func: 482912 usec (avg: 4829.12, min = 4241, max = 9980, stddev = 1270.8).
    orig: 1692050 usec (avg: 16920.5, min = 14679, max = 30060, stddev = 2760.7).
    Checking NEON remap (float, 4-channel->4-channel)
    func: 5324471 usec (avg: 53244.7, min = 49774, max = 87036, stddev = 4255.47).
    orig: 73674628 usec (avg: 736746, min = 720338, max = 824128, stddev = 18361.8).
    Checking NEON remap (s16, 4-channel->4-channel)
    func: 5321320 usec (avg: 53213.2, min = 49591, max = 84443, stddev = 3931.49).
    orig: 24122021 usec (avg: 241220, min = 233337, max = 291687, stddev = 9064.31).
    
    Checking NEON remap (float, stereo rearrange)
    func: 1116547 usec (avg: 11165.5, min = 9124, max = 27496, stddev = 3345.63).
    orig: 1385011 usec (avg: 13850.1, min = 12237, max = 18005, stddev = 1793.05).
    Checking NEON remap (s16, stereo rearrange)
    func: 517027 usec (avg: 5170.27, min = 4577, max = 9735, stddev = 1215.23).
    orig: 1208435 usec (avg: 12084.4, min = 10406, max = 25299, stddev = 2512.02).
    Checking NEON remap (float, 4-channel rearrange)
    func: 1564667 usec (avg: 15646.7, min = 13855, max = 20172, stddev = 1766.48).
    orig: 2970000 usec (avg: 29700, min = 26215, max = 45654, stddev = 2351.07).
    Checking NEON remap (s16, 4-channel rearrange)
    func: 1088808 usec (avg: 10888.1, min = 9064, max = 23407, stddev = 2465.82).
    orig: 1908416 usec (avg: 19084.2, min = 16968, max = 22705, stddev = 1637.46).
    
    Signed-off-by: Peter Meerwald <pmeerw at pmeerw.net>

diff --git a/src/Makefile.am b/src/Makefile.am
index 5c2d5bc..1ac8a16 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -923,12 +923,14 @@ libpulsecore_@PA_MAJORMINOR@_la_LDFLAGS = $(AM_LDFLAGS) -avoid-version
 libpulsecore_@PA_MAJORMINOR@_la_LIBADD = $(AM_LIBADD) $(LIBLTDL) $(LIBSAMPLERATE_LIBS) $(LIBSPEEX_LIBS) $(LIBSNDFILE_LIBS) $(WINSOCK_LIBS) $(LTLIBICONV) libpulsecommon-@PA_MAJORMINOR@.la libpulse.la libpulsecore-foreign.la
 
 if HAVE_NEON
-noinst_LTLIBRARIES += libpulsecore_sconv_neon.la libpulsecore_mix_neon.la
+noinst_LTLIBRARIES += libpulsecore_sconv_neon.la libpulsecore_mix_neon.la libpulsecore_remap_neon.la
 libpulsecore_sconv_neon_la_SOURCES = pulsecore/sconv_neon.c
 libpulsecore_sconv_neon_la_CFLAGS = $(AM_CFLAGS) $(NEON_CFLAGS)
 libpulsecore_mix_neon_la_SOURCES = pulsecore/mix_neon.c
 libpulsecore_mix_neon_la_CFLAGS = $(AM_CFLAGS) $(NEON_CFLAGS)
-libpulsecore_@PA_MAJORMINOR@_la_LIBADD += libpulsecore_sconv_neon.la libpulsecore_mix_neon.la
+libpulsecore_remap_neon_la_SOURCES = pulsecore/remap_neon.c
+libpulsecore_remap_neon_la_CFLAGS = $(AM_CFLAGS) $(NEON_CFLAGS)
+libpulsecore_@PA_MAJORMINOR@_la_LIBADD += libpulsecore_sconv_neon.la libpulsecore_mix_neon.la libpulsecore_remap_neon.la
 endif
 
 ORC_SOURCE += pulsecore/svolume
diff --git a/src/pulsecore/remap_neon.c b/src/pulsecore/remap_neon.c
new file mode 100644
index 0000000..ebacf92
--- /dev/null
+++ b/src/pulsecore/remap_neon.c
@@ -0,0 +1,498 @@
+/***
+  This file is part of PulseAudio.
+
+  Copyright 2013 Peter Meerwald <p.meerwald at bct-electronic.com>
+
+  PulseAudio is free software; you can redistribute it and/or modify
+  it under the terms of the GNU Lesser General Public License as published
+  by the Free Software Foundation; either version 2.1 of the License,
+  or (at your option) any later version.
+
+  PulseAudio is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+  General Public License for more details.
+***/
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <pulse/sample.h>
+#include <pulse/xmalloc.h>
+#include <pulsecore/log.h>
+#include <pulsecore/macro.h>
+
+#include "cpu-arm.h"
+#include "remap.h"
+
+#include <arm_neon.h>
+
+static void remap_mono_to_stereo_float32ne_neon_a8(pa_remap_t *m, float *dst, const float *src, unsigned n) {
+    for (; n >= 4; n -= 4) {
+        __asm__ __volatile__ (
+            "vld1.32    {q0}, [%[src]]!         \n\t"
+            "vmov       q1, q0                  \n\t"
+            "vst2.32    {q0,q1}, [%[dst]]!      \n\t"
+            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
+            : /* input operands */
+            : "memory", "q0", "q1" /* clobber list */
+        );
+    }
+
+    for (; n > 0; n--) {
+        dst[0] = dst[1] = src[0];
+        src++;
+        dst += 2;
+    }
+}
+
+static void remap_mono_to_stereo_float32ne_generic_arm(pa_remap_t *m, float *dst, const float *src, unsigned n) {
+    for (; n >= 2; n -= 2) {
+        __asm__ __volatile__ (
+            "ldm        %[src]!, {r4,r6}        \n\t"
+            "mov        r5, r4                  \n\t"
+            "mov        r7, r6                  \n\t"
+            "stm        %[dst]!, {r4-r7}        \n\t"
+            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
+            : /* input operands */
+            : "memory", "r4", "r5", "r6", "r7" /* clobber list */
+        );
+    }
+
+    if (n > 0)
+        dst[0] = dst[1] = src[0];
+}
+
+static void remap_mono_to_stereo_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
+    for (; n >= 8; n -= 8) {
+        __asm__ __volatile__ (
+            "vld1.16    {q0}, [%[src]]!         \n\t"
+            "vmov       q1, q0                  \n\t"
+            "vst2.16    {q0,q1}, [%[dst]]!      \n\t"
+            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
+            : /* input operands */
+            : "memory", "q0", "q1" /* clobber list */
+        );
+    }
+
+    for (; n > 0; n--) {
+        dst[0] = dst[1] = src[0];
+        src++;
+        dst += 2;
+    }
+}
+
+static void remap_mono_to_ch4_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) {
+    for (; n >= 2; n -= 2) {
+        __asm__ __volatile__ (
+            "vld1.32    {d0}, [%[src]]!         \n\t"
+            "vdup.f32   q1, d0[0]               \n\t"
+            "vdup.f32   q2, d0[1]               \n\t"
+            "vst1.32    {q1,q2}, [%[dst]]!      \n\t"
+            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
+            : /* input operands */
+            : "memory", "q0", "q1", "q2" /* clobber list */
+        );
+    }
+
+    if (n--)
+        dst[0] = dst[1] = dst[2] = dst[3] = src[0];
+}
+
+static void remap_mono_to_ch4_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
+    for (; n >= 4; n -= 4) {
+        __asm__ __volatile__ (
+            "vld1.16    {d0}, [%[src]]!         \n\t"
+            "vdup.s16   d1, d0[1]               \n\t"
+            "vdup.s16   d2, d0[2]               \n\t"
+            "vdup.s16   d3, d0[3]               \n\t"
+            "vdup.s16   d0, d0[0]               \n\t"
+            "vst1.16    {d0,d1,d2,d3}, [%[dst]]!\n\t"
+            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
+            : /* input operands */
+            : "memory", "d0", "d1", "d2", "d3" /* clobber list */
+        );
+    }
+
+    for (; n > 0; n--) {
+        dst[0] = dst[1] = dst[2] = dst[3] = src[0];
+        src++;
+        dst += 4;
+    }
+}
+
+static void remap_stereo_to_mono_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) {
+    const float32x4_t halve = vdupq_n_f32(0.5f);
+    for (; n >= 4; n -= 4) {
+        __asm__ __volatile__ (
+            "vld2.32    {q0,q1}, [%[src]]!      \n\t"
+            "vadd.f32   q0, q0, q1              \n\t"
+            "vmul.f32   q0, q0, %q[halve]       \n\t"
+            "vst1.32    {q0}, [%[dst]]!         \n\t"
+            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
+            : [halve] "w" (halve) /* input operands */
+            : "memory", "q0", "q1" /* clobber list */
+        );
+    }
+
+    for (; n > 0; n--) {
+        dst[0] = (src[0] + src[1])*0.5f;
+        src += 2;
+        dst++;
+    }
+}
+
+static void remap_stereo_to_mono_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
+    for (; n >= 8; n -= 8) {
+        __asm__ __volatile__ (
+            "vld2.16    {q0,q1}, [%[src]]!      \n\t"
+            "vrhadd.s16 q0, q0, q1              \n\t"
+            "vst1.16    {q0}, [%[dst]]!         \n\t"
+            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
+            : /* input operands */
+            : "memory", "q0", "q1" /* clobber list */
+        );
+    }
+
+    for (; n > 0; n--) {
+        dst[0] = (src[0] + src[1])/2;
+        src += 2;
+        dst++;
+    }
+}
+
+static void remap_ch4_to_mono_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) {
+    const float32x2_t quart = vdup_n_f32(0.25f);
+    for (; n >= 2; n -= 2) {
+        __asm__ __volatile__ (
+            "vld4.32    {d0,d1,d2,d3}, [%[src]]!\n\t"
+            "vadd.f32   d0, d0, d1              \n\t"
+            "vadd.f32   d2, d2, d3              \n\t"
+            "vadd.f32   d0, d0, d2              \n\t"
+            "vmul.f32   d0, d0, %[quart]        \n\t"
+            "vst1.32    {d0}, [%[dst]]!         \n\t"
+            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
+            : [quart] "w" (quart) /* input operands */
+            : "memory", "d0", "d1", "d2", "d3" /* clobber list */
+        );
+    }
+
+    if (n > 0)
+        dst[0] = (src[0] + src[1] + src[2] + src[3])*0.25f;
+}
+
+static void remap_ch4_to_mono_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
+    for (; n >= 4; n -= 4) {
+        __asm__ __volatile__ (
+            "vld4.16    {d0,d1,d2,d3}, [%[src]]!\n\t"
+            "vrhadd.s16 d0, d0, d1              \n\t"
+            "vrhadd.s16 d2, d2, d3              \n\t"
+            "vrhadd.s16 d0, d0, d2              \n\t"
+            "vst1.16    {d0}, [%[dst]]!         \n\t"
+            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
+            : /* input operands */
+            : "memory", "d0", "d1", "d2", "d3" /* clobber list */
+        );
+    }
+
+    for (; n > 0; n--) {
+        dst[0] = (src[0] + src[1] + src[2] + src[3])/4;
+        src += 4;
+        dst++;
+    }
+}
+
+static void remap_ch4_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
+    int32x4_t *f = m->state;
+    const int32x4_t f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3];
+
+    for (; n > 0; n--) {
+        __asm__ __volatile__ (
+            "vld1.16    {d0}, [%[src]]!         \n\t"
+            "vmovl.s16  q0, d0                  \n\t"
+            "vdup.s32   q1, d0[0]               \n\t"
+            "vmul.s32   q1, q1, %q[f0]          \n\t"
+            "vdup.s32   q2, d0[1]               \n\t"
+            "vmla.s32   q1, q2, %q[f1]          \n\t"
+            "vdup.s32   q2, d1[0]               \n\t"
+            "vmla.s32   q1, q2, %q[f2]          \n\t"
+            "vdup.s32   q2, d1[1]               \n\t"
+            "vmla.s32   q1, q2, %q[f3]          \n\t"
+            "vqshrn.s32  d2, q1, #16            \n\t"
+            "vst1.32    {d2}, [%[dst]]!         \n\t"
+            : [dst] "+r" (dst), [src] "+r" (src)
+            : [f0] "w" (f0), [f1] "w" (f1), [f2] "w" (f2), [f3] "w" (f3)
+            : "memory", "q0", "q1", "q2"
+        );
+    }
+}
+
+static void remap_ch4_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) {
+    float32x4_t *f = m->state;
+    const float32x4_t f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3];
+
+    for (; n > 0; n--) {
+        __asm__ __volatile__ (
+            "vld1.32    {d0,d1}, [%[src]]!      \n\t"
+            "vdup.f32   q1, d0[0]               \n\t"
+            "vmul.f32   q1, q1, %q[f0]          \n\t"
+            "vdup.f32   q2, d0[1]               \n\t"
+            "vmla.f32   q1, q2, %q[f1]          \n\t"
+            "vdup.f32   q2, d1[0]               \n\t"
+            "vmla.f32   q1, q2, %q[f2]          \n\t"
+            "vdup.f32   q2, d1[1]               \n\t"
+            "vmla.f32   q1, q2, %q[f3]          \n\t"
+            "vst1.32    {d2,d3}, [%[dst]]!      \n\t"
+            : [dst] "+r" (dst), [src] "+r" (src)
+            : [f0] "w" (f0), [f1] "w" (f1), [f2] "w" (f2), [f3] "w" (f3)
+            : "memory", "q0", "q1", "q2"
+        );
+    }
+}
+
+static void remap_arrange_stereo_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
+    const uint8x8_t t = ((uint8x8_t *) m->state)[0];
+
+    for (; n >= 2; n -= 2) {
+        __asm__ __volatile__ (
+            "vld1.s16   d0, [%[src]]!           \n\t"
+            "vtbl.8     d0, {d0}, %[t]          \n\t"
+            "vst1.s16   d0, [%[dst]]!           \n\t"
+            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
+            : [t] "w" (t) /* input operands */
+            : "memory", "d0" /* clobber list */
+        );
+    }
+
+    if (n > 0) {
+        __asm__ __volatile__ (
+            "vld1.32   d0[0], [%[src]]!         \n\t"
+            "vtbl.8    d0, {d0}, %[t]           \n\t"
+            "vst1.32   d0[0], [%[dst]]!         \n\t"
+            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
+            : [t] "w" (t) /* input operands */
+            : "memory", "d0" /* clobber list */
+        );
+    }
+}
+
+static void remap_arrange_ch2_ch4_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
+    const uint8x8_t t = ((uint8x8_t *) m->state)[0];
+
+    for (; n > 0; n--) {
+        __asm__ __volatile__ (
+            "vld1.32    d0[0], [%[src]]!           \n\t"
+            "vtbl.8     d0, {d0}, %[t]          \n\t"
+            "vst1.s16   d0, [%[dst]]!           \n\t"
+            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
+            : [t] "w" (t) /* input operands */
+            : "memory", "d0" /* clobber list */
+        );
+    }
+}
+
+static void remap_arrange_ch4_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
+    const uint8x8_t t = ((uint8x8_t *) m->state)[0];
+
+    for (; n > 0; n--) {
+        __asm__ __volatile__ (
+            "vld1.s16   d0, [%[src]]!           \n\t"
+            "vtbl.8     d0, {d0}, %[t]          \n\t"
+            "vst1.s16   d0, [%[dst]]!           \n\t"
+            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
+            : [t] "w" (t) /* input operands */
+            : "memory", "d0" /* clobber list */
+        );
+    }
+}
+
+static void remap_arrange_stereo_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) {
+    const uint8x8_t t = ((uint8x8_t *)m->state)[0];
+
+    for (; n > 0; n--) {
+        __asm__ __volatile__ (
+            "vld1.f32   d0, [%[src]]!           \n\t"
+            "vtbl.8     d0, {d0}, %[t]          \n\t"
+            "vst1.s16   {d0}, [%[dst]]!         \n\t"
+            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
+            : [t] "w" (t) /* input operands */
+            : "memory", "d0" /* clobber list */
+        );
+    }
+}
+
+static void remap_arrange_ch2_ch4_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) {
+    const uint8x8_t t0 = ((uint8x8_t *)m->state)[0];
+    const uint8x8_t t1 = ((uint8x8_t *)m->state)[1];
+
+    for (; n > 0; n--) {
+        __asm__ __volatile__ (
+            "vld1.f32   d0, [%[src]]!           \n\t"
+            "vtbl.8     d1, {d0}, %[t0]         \n\t"
+            "vtbl.8     d2, {d0}, %[t1]         \n\t"
+            "vst1.s16   {d1,d2}, [%[dst]]!      \n\t"
+            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
+            : [t0] "w" (t0), [t1] "w" (t1) /* input operands */
+            : "memory", "d0", "d1", "d2" /* clobber list */
+        );
+    }
+}
+
+static void remap_arrange_ch4_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) {
+    const uint8x8_t t0 = ((uint8x8_t *)m->state)[0];
+    const uint8x8_t t1 = ((uint8x8_t *)m->state)[1];
+
+    for (; n > 0; n--) {
+        __asm__ __volatile__ (
+            "vld1.f32   {d0,d1}, [%[src]]!      \n\t"
+            "vtbl.8     d2, {d0,d1}, %[t0]      \n\t"
+            "vtbl.8     d3, {d0,d1}, %[t1]      \n\t"
+            "vst1.s16   {d2,d3}, [%[dst]]!      \n\t"
+            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
+            : [t0] "w" (t0), [t1] "w" (t1) /* input operands */
+            : "memory", "d0", "d1", "d2", "d3" /* clobber list */
+        );
+    }
+}
+
+static pa_cpu_arm_flag_t arm_flags;
+
+static void init_remap_neon(pa_remap_t *m) {
+    unsigned n_oc, n_ic;
+    int8_t arrange[PA_CHANNELS_MAX];
+
+    n_oc = m->o_ss.channels;
+    n_ic = m->i_ss.channels;
+
+    if (n_ic == 1 && n_oc == 2 &&
+            m->map_table_i[0][0] == 0x10000 && m->map_table_i[1][0] == 0x10000) {
+        if (arm_flags & PA_CPU_ARM_CORTEX_A8) {
+
+            pa_log_info("Using ARM NEON/A8 mono to stereo remapping");
+            pa_set_remap_func(m, (pa_do_remap_func_t) remap_mono_to_stereo_s16ne_neon,
+                (pa_do_remap_func_t) remap_mono_to_stereo_float32ne_neon_a8);
+        }
+        else {
+            pa_log_info("Using ARM NEON mono to stereo remapping");
+            pa_set_remap_func(m, (pa_do_remap_func_t) remap_mono_to_stereo_s16ne_neon,
+                (pa_do_remap_func_t) remap_mono_to_stereo_float32ne_generic_arm);
+        }
+    } else if (n_ic == 1 && n_oc == 4 &&
+            m->map_table_i[0][0] == 0x10000 && m->map_table_i[1][0] == 0x10000 &&
+            m->map_table_i[2][0] == 0x10000 && m->map_table_i[3][0] == 0x10000) {
+
+        pa_log_info("Using ARM NEON mono to 4-channel remapping");
+        pa_set_remap_func(m, (pa_do_remap_func_t) remap_mono_to_ch4_s16ne_neon,
+            (pa_do_remap_func_t) remap_mono_to_ch4_float32ne_neon);
+    } else if (n_ic == 2 && n_oc == 1 &&
+            m->map_table_i[0][0] == 0x8000 && m->map_table_i[0][1] == 0x8000) {
+
+        pa_log_info("Using ARM NEON stereo to mono remapping");
+        pa_set_remap_func(m, (pa_do_remap_func_t) remap_stereo_to_mono_s16ne_neon,
+            (pa_do_remap_func_t) remap_stereo_to_mono_float32ne_neon);
+    } else if (n_ic == 4 && n_oc == 1 &&
+            m->map_table_i[0][0] == 0x4000 && m->map_table_i[0][1] == 0x4000 &&
+            m->map_table_i[0][2] == 0x4000 && m->map_table_i[0][3] == 0x4000) {
+
+        pa_log_info("Using ARM NEON 4-channel to mono remapping");
+        pa_set_remap_func(m, (pa_do_remap_func_t) remap_ch4_to_mono_s16ne_neon,
+            (pa_do_remap_func_t) remap_ch4_to_mono_float32ne_neon);
+    } else if (pa_setup_remap_arrange(m, arrange) &&
+        ((n_ic == 2 && n_oc == 2) ||
+         (n_ic == 2 && n_oc == 4) ||
+         (n_ic == 4 && n_oc == 4))) {
+        unsigned o;
+
+        if (n_ic == 2 && n_oc == 2) {
+            pa_log_info("Using NEON stereo arrange remapping");
+            pa_set_remap_func(m, (pa_do_remap_func_t) remap_arrange_stereo_s16ne_neon,
+                (pa_do_remap_func_t) remap_arrange_stereo_float32ne_neon);
+        } else if (n_ic == 2 && n_oc == 4) {
+            pa_log_info("Using NEON 2-channel to 4-channel arrange remapping");
+            pa_set_remap_func(m, (pa_do_remap_func_t) remap_arrange_ch2_ch4_s16ne_neon,
+                (pa_do_remap_func_t) remap_arrange_ch2_ch4_float32ne_neon);
+        } else if (n_ic == 4 && n_oc == 4) {
+            pa_log_info("Using NEON 4-channel arrange remapping");
+            pa_set_remap_func(m, (pa_do_remap_func_t) remap_arrange_ch4_s16ne_neon,
+                (pa_do_remap_func_t) remap_arrange_ch4_float32ne_neon);
+        }
+
+        /* setup state */
+        switch (m->format) {
+        case PA_SAMPLE_S16NE: {
+            uint8x8_t *t = m->state = pa_xnew0(uint8x8_t, 1);
+            for (o = 0; o < 4; o++) {
+                if (arrange[o % n_oc] >= 0) {
+                    /* convert channel index to vtbl indices */
+                    unsigned frame = o / n_oc;
+                    ((uint8_t *) t)[o * 2 + 0] = (frame * n_oc + arrange[o % n_oc]) * 2 + 0;
+                    ((uint8_t *) t)[o * 2 + 1] = (frame * n_oc + arrange[o % n_oc]) * 2 + 1;
+                } else {
+                    /* use invalid table indices to map to 0 */
+                    ((uint8_t *) t)[o * 2 + 0] = 0xff;
+                    ((uint8_t *) t)[o * 2 + 1] = 0xff;
+                }
+            }
+            break;
+        }
+        case PA_SAMPLE_FLOAT32NE: {
+            uint8x8_t *t = m->state = pa_xnew0(uint8x8_t, 2);
+            for (o = 0; o < n_oc; o++) {
+                if (arrange[o] >= 0) {
+                    /* convert channel index to vtbl indices */
+                    ((uint8_t *) t)[o * 4 + 0] = arrange[o] * 4 + 0;
+                    ((uint8_t *) t)[o * 4 + 1] = arrange[o] * 4 + 1;
+                    ((uint8_t *) t)[o * 4 + 2] = arrange[o] * 4 + 2;
+                    ((uint8_t *) t)[o * 4 + 3] = arrange[o] * 4 + 3;
+                } else {
+                    /* use invalid table indices to map to 0 */
+                    ((uint8_t *) t)[o * 4 + 0] = 0xff;
+                    ((uint8_t *) t)[o * 4 + 1] = 0xff;
+                    ((uint8_t *) t)[o * 4 + 2] = 0xff;
+                    ((uint8_t *) t)[o * 4 + 3] = 0xff;
+                }
+            }
+            break;
+        }
+        default:
+            pa_assert_not_reached();
+        }
+    } else if (n_ic == 4 && n_oc == 4) {
+        unsigned i, o;
+
+        pa_log_info("Using ARM NEON 4-channel remapping");
+        pa_set_remap_func(m, (pa_do_remap_func_t) remap_ch4_s16ne_neon,
+            (pa_do_remap_func_t) remap_ch4_float32ne_neon);
+
+        /* setup state */
+        switch (m->format) {
+        case PA_SAMPLE_S16NE: {
+            int32x4_t *f = m->state = pa_xnew0(int32x4_t, 4);
+            for (o = 0; o < 4; o++) {
+                for (i = 0; i < 4; i++) {
+                    ((int *) &f[i])[o] = PA_CLAMP_UNLIKELY(m->map_table_i[o][i], 0, 0x10000);
+                }
+            }
+            break;
+        }
+        case PA_SAMPLE_FLOAT32NE: {
+            float32x4_t *f = m->state = pa_xnew0(float32x4_t, 4);
+            for (o = 0; o < 4; o++) {
+                for (i = 0; i < 4; i++) {
+                    ((float *) &f[i])[o] = PA_CLAMP_UNLIKELY(m->map_table_f[o][i], 0.0f, 1.0f);
+                }
+            }
+            break;
+        }
+        default:
+            pa_assert_not_reached();
+        }
+    }
+}
+
+void pa_remap_func_init_neon(pa_cpu_arm_flag_t flags) {
+    pa_log_info("Initialising ARM NEON optimized remappers.");
+    arm_flags = flags;
+    pa_set_init_remap_func((pa_init_remap_func_t) init_remap_neon);
+}

commit 40450bdbf21b415b54b6fc1d86fcb8b6f372bef8
Author: Peter Meerwald <p.meerwald at bct-electronic.com>
Date:   Thu Jul 5 14:32:59 2012 +0200

    core: Distinguish Cortex processors: A8 vs later (A9, A15)
    
    Signed-off-by: Peter Meerwald <p.meerwald at bct-electronic.com>

diff --git a/src/pulsecore/cpu-arm.c b/src/pulsecore/cpu-arm.c
index cce3b91..32f0e53 100644
--- a/src/pulsecore/cpu-arm.c
+++ b/src/pulsecore/cpu-arm.c
@@ -80,10 +80,9 @@ static char *get_cpuinfo(void) {
 #endif /* defined (__arm__) && defined (__linux__) */
 
 void pa_cpu_get_arm_flags(pa_cpu_arm_flag_t *flags) {
-#if defined (__arm__)
-#if defined (__linux__)
+#if defined (__arm__) && defined (__linux__)
     char *cpuinfo, *line;
-    int arch;
+    int arch, part;
 
     /* We need to read the CPU flags from /proc/cpuinfo because there is no user
      * space support to get the CPU features. This only works on linux AFAIK. */
@@ -104,6 +103,7 @@ void pa_cpu_get_arm_flags(pa_cpu_arm_flag_t *flags) {
 
         pa_xfree(line);
     }
+
     /* get the CPU features */
     if ((line = get_cpuinfo_line(cpuinfo, "Features"))) {
         const char *state = NULL;
@@ -122,16 +122,24 @@ void pa_cpu_get_arm_flags(pa_cpu_arm_flag_t *flags) {
             pa_xfree(current);
         }
     }
+
+    /* get the CPU part number */
+    if ((line = get_cpuinfo_line(cpuinfo, "CPU part"))) {
+        part = strtoul(line, NULL, 0);
+        if (part == 0xc08)
+            *flags |= PA_CPU_ARM_CORTEX_A8;
+        pa_xfree(line);
+    }
     pa_xfree(cpuinfo);
 
-    pa_log_info("CPU flags: %s%s%s%s%s%s",
+    pa_log_info("CPU flags: %s%s%s%s%s%s%s",
           (*flags & PA_CPU_ARM_V6) ? "V6 " : "",
           (*flags & PA_CPU_ARM_V7) ? "V7 " : "",
           (*flags & PA_CPU_ARM_VFP) ? "VFP " : "",
           (*flags & PA_CPU_ARM_EDSP) ? "EDSP " : "",
           (*flags & PA_CPU_ARM_NEON) ? "NEON " : "",
-          (*flags & PA_CPU_ARM_VFPV3) ? "VFPV3 " : "");
-#endif
+          (*flags & PA_CPU_ARM_VFPV3) ? "VFPV3 " : "",
+          (*flags & PA_CPU_ARM_CORTEX_A8) ? "Cortex-A8 " : "");
 #endif
 }
 
diff --git a/src/pulsecore/cpu-arm.h b/src/pulsecore/cpu-arm.h
index 5bc7d3b..51f7395 100644
--- a/src/pulsecore/cpu-arm.h
+++ b/src/pulsecore/cpu-arm.h
@@ -36,7 +36,8 @@ typedef enum pa_cpu_arm_flag {
     PA_CPU_ARM_VFP      = (1 << 2),
     PA_CPU_ARM_EDSP     = (1 << 3),
     PA_CPU_ARM_NEON     = (1 << 4),
-    PA_CPU_ARM_VFPV3    = (1 << 5)
+    PA_CPU_ARM_VFPV3    = (1 << 5),
+    PA_CPU_ARM_CORTEX_A8 = (1 << 6),
 } pa_cpu_arm_flag_t;
 
 void pa_cpu_get_arm_flags(pa_cpu_arm_flag_t *flags);

commit 789da0c0639deb3e1e30426cc483ac851f61931c
Author: Peter Meerwald <pmeerw at pmeerw.net>
Date:   Wed Apr 16 00:47:50 2014 +0200

    mix: Add special-case ARM NEON code for s16 mixing
    
    note that orig is the time of the special-case C implementation where available, not
    the generic matrix remapping implementation
    
    on ARM Cortex-A8 (TI OMAP3 DM3730 @ 1GHz) (Linaro GCC 4.6):
    
    Checking NEON mix (s16, stereo)
    func: 2096927 usec (avg: 20969.3, min = 18646, max = 24475, stddev = 1647.36).
    orig: 7113956 usec (avg: 71139.6, min = 65705, max = 102601, stddev = 4475.93).
    Checking NEON mix (s16, 4-channel)
    func: 4093053 usec (avg: 40930.5, min = 39093, max = 48217, stddev = 1862.16).
    orig: 15664104 usec (avg: 156641, min = 149781, max = 218598, stddev = 8819.22).
    Checking NEON mix (s16, mono)
    func: 1139558 usec (avg: 11395.6, min = 9826, max = 25299, stddev = 2495.29).
    orig: 3219118 usec (avg: 32191.2, min = 28412, max = 46509, stddev = 2095.34).
    
    Signed-off-by: Peter Meerwald <pmeerw at pmeerw.net>

diff --git a/src/pulsecore/mix_neon.c b/src/pulsecore/mix_neon.c
index ff05ccf..eb02d81 100644
--- a/src/pulsecore/mix_neon.c
+++ b/src/pulsecore/mix_neon.c
@@ -20,6 +20,7 @@
 
 #include <pulsecore/macro.h>
 #include <pulsecore/endianmacros.h>
+#include <pulsecore/sample-util.h>
 
 #include "cpu-arm.h"
 #include "mix.h"
@@ -79,8 +80,136 @@ static void pa_mix_ch2_s16ne_neon(pa_mix_info streams[], unsigned nstreams, uint
     fallback(streams, nstreams, 2, data, length & mask);
 }
 
+/* Special case: mix 2 s16ne streams, 1 channel each, applying each stream's channel-0 volume and writing the saturated sum to data. length is divided by sizeof(int16_t) below, so it is presumably a byte count -- TODO confirm against callers. */
+static void pa_mix2_ch1_s16ne_neon(pa_mix_info streams[], int16_t *data, unsigned length) {
+    const int16_t *ptr0 = streams[0].ptr;
+    const int16_t *ptr1 = streams[1].ptr;
+
+    int32x4_t sv0, sv1; /* per-stream volume, replicated into all four 32-bit lanes */
+    __asm__ __volatile__ (
+        "vdup.s32    %q[sv0], %[lin0]        \n\t" /* broadcast stream 0's linear[0] to every lane */
+        "vdup.s32    %q[sv1], %[lin1]        \n\t" /* broadcast stream 1's linear[0] to every lane */
+        : [sv0] "=w" (sv0), [sv1] "=w" (sv1)
+        : [lin0] "r" (streams[0].linear[0]), [lin1] "r" (streams[1].linear[0])
+        : /* clobber list */
+    );
+
+    length /= sizeof(int16_t); /* from here on, length counts int16_t samples */
+    for (; length >= 4; length -= 4) { /* vector path: four samples per iteration */
+        __asm__ __volatile__ (
+            "vld1.s16    d0, [%[ptr0]]!      \n\t" /* load four s16 from stream 0, post-increment ptr0 */
+            "vld1.s16    d2, [%[ptr1]]!      \n\t" /* load four s16 from stream 1, post-increment ptr1 */
+            "vshll.s16   q0, d0, #15         \n\t" /* widen to 32 bit and shift left by 15 */
+            "vshll.s16   q1, d2, #15         \n\t"
+            "vqdmulh.s32 q0, q0, %q[sv0]     \n\t" /* saturating doubling multiply, high half; with the shift above: (sample * volume) >> 16 */
+            "vqdmulh.s32 q1, q1, %q[sv1]     \n\t"
+            "vqadd.s32   q0, q0, q1          \n\t" /* saturating add of the two scaled streams */
+            "vqmovn.s32  d0, q0              \n\t" /* saturating narrow back to s16 */
+            "vst1.s16    d0, [%[data]]!      \n\t" /* store four mixed samples, post-increment data */
+            : [ptr0] "+r" (ptr0), [ptr1] "+r" (ptr1), [data] "+r" (data)
+            : [sv0] "w" (sv0), [sv1] "w" (sv1)
+            : "memory", "cc", "q0", "q1" /* clobber list */
+        );
+    }
+
+    for (; length > 0; length--) { /* scalar tail: the up to three samples left over by the vector loop */
+        int32_t sum = pa_mult_s16_volume(*ptr0++, streams[0].linear[0].i);
+        sum += pa_mult_s16_volume(*ptr1++, streams[1].linear[0].i);
+        *data++ = PA_CLAMP_UNLIKELY(sum, -0x8000, 0x7FFF); /* clamp the sum to the int16_t range */
+    }
+}
+
+/* Special case: mix 2 s16ne streams, 2 channels each; even-indexed samples use linear[0] and odd-indexed samples use linear[1] of each stream. length is divided by sizeof(int16_t) below, so it is presumably a byte count -- TODO confirm against callers. */
+static void pa_mix2_ch2_s16ne_neon(pa_mix_info streams[], int16_t *data, unsigned length) {
+    const int16_t *ptr0 = streams[0].ptr;
+    const int16_t *ptr1 = streams[1].ptr;
+
+    int32x4_t sv0, sv1; /* per-stream volumes laid out as { linear[0], linear[1], linear[0], linear[1] } */
+    __asm__ __volatile__ ( /* NOTE(review): reads memory through lin0/lin1 without a "memory" clobber or "m" input -- confirm this is safe */
+        "vld1.s32 d0, [%[lin0]]              \n\t" /* load 64 bits from streams[0].linear; assumes each linear[] entry is 32 bits wide -- TODO confirm */
+        "vmov.s32 d1, d0                     \n\t" /* duplicate the pair into the upper half of q0 */
+        "vmov.s32 %q[sv0], q0                \n\t"
+        "vld1.s32 d0, [%[lin1]]              \n\t" /* same sequence for stream 1 */
+        "vmov.s32 d1, d0                     \n\t"
+        "vmov.s32 %q[sv1], q0                \n\t"
+        : [sv0] "=w" (sv0), [sv1] "=w" (sv1)
+        : [lin0] "r" (streams[0].linear), [lin1] "r" (streams[1].linear)
+        : "q0" /* clobber list */
+    );
+
+    length /= sizeof(int16_t); /* from here on, length counts int16_t samples */
+    for (; length >= 4; length -= 4) { /* vector path: four samples (two stereo frames) per iteration */
+        __asm__ __volatile__ (
+            "vld1.s16    d0, [%[ptr0]]!      \n\t" /* load four s16 from stream 0, post-increment ptr0 */
+            "vld1.s16    d2, [%[ptr1]]!      \n\t" /* load four s16 from stream 1, post-increment ptr1 */
+            "vshll.s16   q0, d0, #15         \n\t" /* widen to 32 bit and shift left by 15 */
+            "vshll.s16   q1, d2, #15         \n\t"
+            "vqdmulh.s32 q0, q0, %q[sv0]     \n\t" /* saturating doubling multiply, high half; with the shift above: (sample * volume) >> 16 */
+            "vqdmulh.s32 q1, q1, %q[sv1]     \n\t"
+            "vqadd.s32   q0, q0, q1          \n\t" /* saturating add of the two scaled streams */
+            "vqmovn.s32  d0, q0              \n\t" /* saturating narrow back to s16 */
+            "vst1.s16    d0, [%[data]]!      \n\t" /* store four mixed samples, post-increment data */
+            : [ptr0] "+r" (ptr0), [ptr1] "+r" (ptr1), [data] "+r" (data)
+            : [sv0] "w" (sv0), [sv1] "w" (sv1)
+            : "memory", "cc", "q0", "q1" /* clobber list */
+        );
+    }
+
+    if (length > 0) { /* scalar tail: mixes exactly two samples; presumably length is always a whole number of stereo frames -- TODO confirm */
+        int32_t sum;
+
+        sum = pa_mult_s16_volume(*ptr0++, streams[0].linear[0].i); /* first sample of the frame: linear[0] */
+        sum += pa_mult_s16_volume(*ptr1++, streams[1].linear[0].i);
+        *data++ = PA_CLAMP_UNLIKELY(sum, -0x8000, 0x7FFF);
+
+        sum = pa_mult_s16_volume(*ptr0++, streams[0].linear[1].i); /* second sample of the frame: linear[1] */
+        sum += pa_mult_s16_volume(*ptr1++, streams[1].linear[1].i);
+        *data++ = PA_CLAMP_UNLIKELY(sum, -0x8000, 0x7FFF);
+    }
+}
+
+/* Special case: mix 2 s16ne streams, 4 channels each; each stream's four volumes are loaded from its linear[] array. length is divided by sizeof(int16_t) below, so it is presumably a byte count -- TODO confirm against callers. */
+static void pa_mix2_ch4_s16ne_neon(pa_mix_info streams[], int16_t *data, unsigned length) {
+    const int16_t *ptr0 = streams[0].ptr;
+    const int16_t *ptr1 = streams[1].ptr;
+
+    int32x4_t sv0, sv1; /* per-stream volumes, one 32-bit lane per channel */
+
+    __asm__ __volatile__ ( /* NOTE(review): reads memory through lin0/lin1 without a "memory" clobber or "m" input -- confirm this is safe */
+        "vld1.s32 %h[sv0], [%[lin0]]         \n\t" /* fill sv0 from streams[0].linear; assumes each linear[] entry is 32 bits wide -- TODO confirm */
+        "vld1.s32 %h[sv1], [%[lin1]]         \n\t" /* fill sv1 from streams[1].linear */
+        : [sv0] "=w" (sv0), [sv1] "=w" (sv1)
+        : [lin0] "r" (streams[0].linear), [lin1] "r" (streams[1].linear)
+        : /* clobber list */
+    );
+
+    length /= sizeof(int16_t); /* from here on, length counts int16_t samples */
+    for (; length >= 4; length -= 4) { /* vector path: four samples (one 4-channel frame) per iteration */
+        __asm__ __volatile__ (
+            "vld1.s16    d0, [%[ptr0]]!      \n\t" /* load four s16 from stream 0, post-increment ptr0 */
+            "vld1.s16    d2, [%[ptr1]]!      \n\t" /* load four s16 from stream 1, post-increment ptr1 */
+            "vshll.s16   q0, d0, #15         \n\t" /* widen to 32 bit and shift left by 15 */
+            "vshll.s16   q1, d2, #15         \n\t"
+            "vqdmulh.s32 q0, q0, %q[sv0]     \n\t" /* saturating doubling multiply, high half; with the shift above: (sample * volume) >> 16 */
+            "vqdmulh.s32 q1, q1, %q[sv1]     \n\t"
+            "vqadd.s32   q0, q0, q1          \n\t" /* saturating add of the two scaled streams */
+            "vqmovn.s32  d0, q0              \n\t" /* saturating narrow back to s16 */
+            "vst1.s16    d0, [%[data]]!      \n\t" /* store four mixed samples, post-increment data */
+            : [ptr0] "+r" (ptr0), [ptr1] "+r" (ptr1), [data] "+r" (data)
+            : [sv0] "w" (sv0), [sv1] "w" (sv1)
+            : "memory", "cc", "q0", "q1" /* clobber list */
+        );
+    } /* no scalar tail: any remaining 1-3 samples are left unprocessed; presumably length is always a whole number of 4-channel frames -- TODO confirm */
+}
+
 static void pa_mix_s16ne_neon(pa_mix_info streams[], unsigned nstreams, unsigned nchannels, void *data, unsigned length) {
-    if (nchannels == 2)
+    if (nstreams == 2 && nchannels == 2)
+        pa_mix2_ch2_s16ne_neon(streams, data, length);
+    else if (nstreams == 2 && nchannels == 4)
+        pa_mix2_ch4_s16ne_neon(streams, data, length);
+    else if (nstreams == 2 && nchannels == 1)
+        pa_mix2_ch1_s16ne_neon(streams, data, length);
+    else if (nchannels == 2)
         pa_mix_ch2_s16ne_neon(streams, nstreams, data, length);
     else
         fallback(streams, nstreams, nchannels, data, length);