[pulseaudio-discuss] [PATCH 1/4] core: add ARM NEON optimized mono-to-stereo remapping code to build

Peter Meerwald pmeerw at pmeerw.net
Thu Jan 12 08:20:06 PST 2012


From: Peter Meerwald <p.meerwald at bct-electronic.com>

---
 src/Makefile.am            |    1 +
 src/pulsecore/remap_neon.c |  212 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 213 insertions(+), 0 deletions(-)
 create mode 100644 src/pulsecore/remap_neon.c

diff --git a/src/Makefile.am b/src/Makefile.am
index 02635fa..9211ec5 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -810,6 +810,7 @@ libpulsecore_@PA_MAJORMINOR@_la_SOURCES = \
 		pulsecore/play-memchunk.c pulsecore/play-memchunk.h \
 		pulsecore/remap.c pulsecore/remap.h \
 		pulsecore/remap_mmx.c pulsecore/remap_sse.c \
+		pulsecore/remap_neon.c \
 		pulsecore/resampler.c pulsecore/resampler.h \
 		pulsecore/rtpoll.c pulsecore/rtpoll.h \
 		pulsecore/sample-util.c pulsecore/sample-util.h \
diff --git a/src/pulsecore/remap_neon.c b/src/pulsecore/remap_neon.c
new file mode 100644
index 0000000..b6377b5
--- /dev/null
+++ b/src/pulsecore/remap_neon.c
@@ -0,0 +1,212 @@
+/***
+  This file is part of PulseAudio.
+
+  Copyright 2012 Peter Meerwald <p.meerwald at bct-electronic.com>
+
+  PulseAudio is free software; you can redistribute it and/or modify
+  it under the terms of the GNU Lesser General Public License as published
+  by the Free Software Foundation; either version 2.1 of the License,
+  or (at your option) any later version.
+
+  PulseAudio is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+  General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public License
+  along with PulseAudio; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+  USA.
+***/
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <pulse/rtclock.h>
+#include <pulse/sample.h>
+#include <pulsecore/log.h>
+#include <pulsecore/macro.h>
+
+#include "cpu-arm.h"
+#include "remap.h"
+
+#if defined(__ARM_NEON__)
+
+#include <math.h>
+#include <arm_neon.h>
+
+/* Duplicate each of the n mono samples at src into both channels of the
+ * interleaved stereo buffer at dst, so 2*n samples are written.
+ * The sample type is taken from *m->format: only PA_SAMPLE_FLOAT32NE and
+ * PA_SAMPLE_S16NE are handled, anything else reaches
+ * pa_assert_not_reached(). */
+static void remap_mono_to_stereo_neon(pa_remap_t *m, void *dst, const void *src, unsigned n) {
+    unsigned blocks, rest;
+
+    switch (*m->format) {
+        case PA_SAMPLE_FLOAT32NE: {
+            float *out = dst;
+            const float *in = src; /* src stays const; vld1q_f32() only reads */
+
+            /* four samples per iteration: one load, one interleaving store */
+            for (blocks = n / 4; blocks > 0; blocks--, in += 4, out += 8) {
+                float32x4x2_t pair;
+                pair.val[0] = vld1q_f32(in);
+                pair.val[1] = pair.val[0];
+                vst2q_f32(out, pair);
+            }
+
+            /* scalar tail for the remaining n % 4 samples */
+            for (rest = n % 4; rest > 0; rest--, in++, out += 2)
+                out[0] = out[1] = *in;
+            break;
+        }
+        case PA_SAMPLE_S16NE: {
+            int16_t *out = dst;
+            const int16_t *in = src; /* src stays const; vld1q_s16() only reads */
+
+            /* eight samples per iteration */
+            for (blocks = n / 8; blocks > 0; blocks--, in += 8, out += 16) {
+                int16x8x2_t pair;
+                pair.val[0] = vld1q_s16(in);
+                pair.val[1] = pair.val[0];
+                vst2q_s16(out, pair);
+            }
+
+            /* scalar tail for the remaining n % 8 samples */
+            for (rest = n % 8; rest > 0; rest--, in++, out += 2)
+                out[0] = out[1] = *in;
+            break;
+        }
+        default:
+            pa_assert_not_reached();
+    }
+}
+
+#ifdef NO_TEST_YET_SINCE_HARD_TO_CALL_REFERENCE_IMPL
+
+#define SAMPLES 1019 /* mono samples per test run; 1019 % 8 == 3, so the scalar tail loops run too */
+#define TIMES 1000 /* calls made in each timed loop */
+
+/* Self-test (float): compare remap_mono_to_stereo_neon() with
+ * remap_mono_to_stereo_c() on random input, log (but do not fail on) any
+ * differing sample, then log how long TIMES calls of each take.
+ * NOTE(review): remap_mono_to_stereo_c() has no declaration in this file. */
+static void run_test_float(void) {
+    float stereo[2*SAMPLES], stereo_ref[2*SAMPLES], mono[SAMPLES];
+    int i;
+    pa_usec_t start, stop;
+    pa_sample_format_t sf = PA_SAMPLE_FLOAT32NE;
+    pa_remap_t remap;
+
+    pa_log_debug("checking NEON remap_mono_to_stereo(float, %d)", SAMPLES);
+
+    memset(stereo_ref, 0, sizeof(stereo_ref));
+    memset(stereo, 0, sizeof(stereo));
+
+    /* random mono input in [-0.5, 0.5] */
+    for (i = 0; i < SAMPLES; i++)
+        mono[i] = rand()/(float) RAND_MAX - 0.5f;
+
+    remap.format = &sf; /* the only field remap_mono_to_stereo_neon() reads */
+    remap_mono_to_stereo_c(&remap, stereo_ref, mono, SAMPLES);
+    remap_mono_to_stereo_neon(&remap, stereo, mono, SAMPLES);
+
+    for (i = 0; i < 2*SAMPLES; i++) {
+        if (fabsf(stereo[i] - stereo_ref[i]) > 0.00001) {
+            pa_log_debug("%d: %.3f != %.3f (%.3f)", i, stereo[i], stereo_ref[i],
+                      mono[i/2]);
+        }
+    }
+
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++)
+        remap_mono_to_stereo_neon(&remap, stereo, mono, SAMPLES);
+    stop = pa_rtclock_now();
+    pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));
+
+    /* time the same reference implementation the comparison above used */
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++)
+        remap_mono_to_stereo_c(&remap, stereo_ref, mono, SAMPLES);
+    stop = pa_rtclock_now();
+    pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
+}
+
+/* Self-test (s16): compare remap_mono_to_stereo_neon() with
+ * remap_mono_to_stereo_c() on random input, log (but do not fail on) any
+ * differing sample, then log how long TIMES calls of each take.
+ * NOTE(review): remap_mono_to_stereo_c() has no declaration in this file. */
+static void run_test_s16(void) {
+    int16_t stereo[2*SAMPLES], stereo_ref[2*SAMPLES], mono[SAMPLES];
+    int i;
+    pa_usec_t start, stop;
+    pa_sample_format_t sf = PA_SAMPLE_S16NE;
+    pa_remap_t remap;
+
+    pa_log_debug("checking NEON remap_mono_to_stereo(s16, %d)", SAMPLES);
+
+    memset(stereo_ref, 0, sizeof(stereo_ref));
+    memset(stereo, 0, sizeof(stereo));
+
+    /* random mono input; the int result is narrowed to int16_t on assignment */
+    for (i = 0; i < SAMPLES; i++)
+        mono[i] = rand() - RAND_MAX/2;
+
+    remap.format = &sf; /* the only field remap_mono_to_stereo_neon() reads */
+    remap_mono_to_stereo_c(&remap, stereo_ref, mono, SAMPLES);
+    remap_mono_to_stereo_neon(&remap, stereo, mono, SAMPLES);
+
+    for (i = 0; i < 2*SAMPLES; i++) {
+        if (abs(stereo[i] - stereo_ref[i]) > 0) {
+            pa_log_debug("%d: %d != %d (%d)", i, stereo[i], stereo_ref[i],
+                      mono[i/2]);
+        }
+    }
+
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++)
+        remap_mono_to_stereo_neon(&remap, stereo, mono, SAMPLES);
+    stop = pa_rtclock_now();
+    pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));
+
+    /* time the same reference implementation the comparison above used */
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++)
+        remap_mono_to_stereo_c(&remap, stereo_ref, mono, SAMPLES);
+    stop = pa_rtclock_now();
+    pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
+}
+#endif /* NO_TEST_YET_SINCE_HARD_TO_CALL_REFERENCE_IMPL */
+
+/* Install the NEON mono-to-stereo remapper on m when its layout matches. */
+static void init_remap_neon(pa_remap_t *m) {
+    const unsigned out_ch = m->o_ss->channels, in_ch = m->i_ss->channels;
+    /* only 1 -> 2 channels is accelerated; any other layout leaves m untouched */
+    if (in_ch != 1 || out_ch != 2)
+        return;
+    /* map_table_f[0][0] and [1][0] must both be >= 1.0 (presumably the gains
+     * of input 0 into outputs 0 and 1 -- confirm against remap.h) */
+    if (!(m->map_table_f[0][0] >= 1.0) || !(m->map_table_f[1][0] >= 1.0))
+        return;
+    m->do_remap = (pa_do_remap_func_t) remap_mono_to_stereo_neon;
+    pa_log_info("Using ARM NEON mono to stereo remapping");
+}
+#endif /* defined (__ARM_NEON__) */
+
+/* Register init_remap_neon() when flags contains PA_CPU_ARM_NEON. */
+void pa_remap_func_init_neon(pa_cpu_arm_flag_t flags) {
+#if defined (__ARM_NEON__)
+    if (!(flags & PA_CPU_ARM_NEON))
+        return;
+
+#if defined(RUN_TEST) && defined(NO_TEST_YET_SINCE_HARD_TO_CALL_REFERENCE_IMPL)
+    /* the self-tests execute NEON code, so run them only after the flag check */
+    run_test_float();
+    run_test_s16();
+#endif
+    pa_log_info("Initialising ARM NEON optimized remappers.");
+    pa_set_init_remap_func((pa_init_remap_func_t) init_remap_neon);
+#endif /* defined (__ARM_NEON__) */
+}
-- 
1.7.4.1



More information about the pulseaudio-discuss mailing list