[pulseaudio-discuss] [PATCH 3/4] core: add ARM NEON optimized sample conversion code
Peter Meerwald
pmeerw at pmeerw.net
Thu Jan 12 08:20:08 PST 2012
From: Peter Meerwald <p.meerwald at bct-electronic.com>
---
src/Makefile.am | 2 +-
src/pulsecore/sconv_neon.c | 187 ++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 188 insertions(+), 1 deletions(-)
create mode 100644 src/pulsecore/sconv_neon.c
diff --git a/src/Makefile.am b/src/Makefile.am
index 645348e..379bd32 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -823,7 +823,7 @@ libpulsecore_ at PA_MAJORMINOR@_la_SOURCES = \
pulsecore/svolume_neon.c \
pulsecore/sconv-s16be.c pulsecore/sconv-s16be.h \
pulsecore/sconv-s16le.c pulsecore/sconv-s16le.h \
- pulsecore/sconv_sse.c \
+ pulsecore/sconv_sse.c pulsecore/sconv_neon.c \
pulsecore/sconv.c pulsecore/sconv.h \
pulsecore/shared.c pulsecore/shared.h \
pulsecore/sink-input.c pulsecore/sink-input.h \
diff --git a/src/pulsecore/sconv_neon.c b/src/pulsecore/sconv_neon.c
new file mode 100644
index 0000000..141e951
--- /dev/null
+++ b/src/pulsecore/sconv_neon.c
@@ -0,0 +1,187 @@
+/***
+ This file is part of PulseAudio.
+
+ Copyright 2012 Peter Meerwald <p.meerwald at bct-electronic.com>
+
+ PulseAudio is free software; you can redistribute it and/or modify
+ it under the terms of the GNU Lesser General Public License as published
+ by the Free Software Foundation; either version 2.1 of the License,
+ or (at your option) any later version.
+
+ PulseAudio is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with PulseAudio; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+ USA.
+***/
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <pulse/rtclock.h>
+
+#include <pulsecore/macro.h>
+#include <pulsecore/endianmacros.h>
+
+#include "cpu-arm.h"
+#include "sconv.h"
+
+#if defined(__ARM_NEON__)
+
+#include <math.h>
+#include <arm_neon.h>
+
+/* When defined, pa_convert_func_init_neon() calls run_test_from() and
+ * run_test_to(), which compare the NEON converters against the currently
+ * registered ones and log timings for both. */
+#define RUN_TEST
+
+/*
+ * Convert n float samples from a[] into signed 16 bit samples in b[].
+ *
+ * Every input is clamped to [-1.0, 1.0] and scaled by 32767. Groups of four
+ * samples are processed with NEON; the remaining n % 4 samples are handled
+ * by the scalar loop at the end.
+ */
+static void pa_sconv_s16le_from_f32ne_neon(unsigned n, const float *a, int16_t *b) {
+    const unsigned vec_end = n & ~3U;
+    unsigned i;
+
+    const float32x4_t plusone4 = vdupq_n_f32(1.0f);
+    const float32x4_t minusone4 = vdupq_n_f32(-1.0f);
+    const float32x4_t half4 = vdupq_n_f32(0.5f);
+    const float32x4_t scale4 = vdupq_n_f32(32767.0f);
+    const uint32x4_t mask4 = vdupq_n_u32(0x80000000);
+
+    for (i = 0; i < vec_end; i += 4) {
+        /* Explicit load/store intrinsics only require element alignment and
+         * do not type-pun the sample buffers through vector pointers. */
+        float32x4_t v4 = vld1q_f32(a + i);
+        float32x4_t w4;
+
+        v4 = vmulq_f32(vmaxq_f32(vminq_f32(v4, plusone4), minusone4), scale4);
+
+        /* w4 = 0.5 carrying the sign of v4: isolate the sign bit of v4 and
+         * OR it into the bit pattern of 0.5 */
+        w4 = vreinterpretq_f32_u32(vorrq_u32(vandq_u32(
+                vreinterpretq_u32_f32(v4), mask4), vreinterpretq_u32_f32(half4)));
+
+        /* the float -> int conversion truncates, so adding +-0.5 beforehand
+         * rounds to the nearest integer */
+        vst1_s16(b + i, vmovn_s32(vcvtq_s32_f32(vaddq_f32(v4, w4))));
+    }
+
+    /* leftovers */
+    for (; i < n; i++) {
+        b[i] = (int16_t) lrintf(PA_CLAMP_UNLIKELY(a[i], -1.0f, 1.0f) * 0x7FFF);
+    }
+}
+
+/*
+ * Convert n signed 16 bit samples from a[] into float samples in b[].
+ *
+ * Each sample is multiplied by 1/32767. Groups of four samples are processed
+ * with NEON; the remaining n % 4 samples are handled by the scalar loop at
+ * the end.
+ */
+static void pa_sconv_s16le_to_f32ne_neon(unsigned n, const int16_t *a, float *b) {
+    const unsigned vec_end = n & ~3U;
+    const float invscale = 1.0f / 0x7FFF;
+    const float32x4_t invscale4 = vdupq_n_f32(invscale);
+    unsigned i;
+
+    for (i = 0; i < vec_end; i += 4) {
+        /* Explicit load/store intrinsics only require element alignment and
+         * keep the const qualifier of the input buffer intact. */
+        const int16x4_t s4 = vld1_s16(a + i);
+
+        /* widen to 32 bit, convert to float, scale */
+        vst1q_f32(b + i, vmulq_f32(vcvtq_f32_s32(vmovl_s16(s4)), invscale4));
+    }
+
+    /* leftovers */
+    for (; i < n; i++) {
+        b[i] = a[i] * invscale;
+    }
+}
+
+/* Number of samples per test buffer; 1019 is not a multiple of 4, so the
+ * scalar leftover loops of the converters are exercised as well. */
+#define SAMPLES 1019
+/* Number of repetitions of each conversion in the timing loops. */
+#define TIMES 300
+
+/*
+ * Self-test and benchmark for pa_sconv_s16le_from_f32ne_neon().
+ *
+ * Converts a buffer of random floats with both the NEON function and the
+ * currently registered float -> S16LE converter, logs every sample where the
+ * two results differ, then logs the time both take for TIMES runs.
+ */
+static void run_test_from(void) {
+    float input[SAMPLES];
+    int16_t out_neon[SAMPLES];
+    int16_t out_ref[SAMPLES];
+    pa_convert_func_t ref_func;
+    pa_usec_t t_begin, t_end;
+    int k;
+
+    pa_log_debug("checking NEON sconv_s16le_from_float(%d)", SAMPLES);
+
+    memset(out_ref, 0, sizeof(out_ref));
+    memset(out_neon, 0, sizeof(out_neon));
+
+    /* random values in [-1.05, 1.05], so some inputs need clamping */
+    k = 0;
+    while (k < SAMPLES) {
+        const float unit = rand()/(float) RAND_MAX;
+        input[k] = 2.1f * (unit - 0.5f);
+        k++;
+    }
+
+    ref_func = (pa_convert_func_t) pa_get_convert_from_float32ne_function(PA_SAMPLE_S16LE);
+    ref_func(SAMPLES, input, out_ref);
+    pa_sconv_s16le_from_f32ne_neon(SAMPLES, input, out_neon);
+
+    /* report every position where the two conversions disagree */
+    for (k = 0; k < SAMPLES; k++) {
+        if (out_neon[k] == out_ref[k])
+            continue;
+
+        pa_log_debug("%d: %d != %d (%f)", k, out_neon[k], out_ref[k],
+                     input[k]);
+    }
+
+    /* time the NEON implementation */
+    t_begin = pa_rtclock_now();
+    for (k = TIMES; k > 0; k--)
+        pa_sconv_s16le_from_f32ne_neon(SAMPLES, input, out_neon);
+    t_end = pa_rtclock_now();
+    pa_log_info("NEON: %llu usec.", (long long unsigned int)(t_end - t_begin));
+
+    /* time the reference implementation */
+    t_begin = pa_rtclock_now();
+    for (k = TIMES; k > 0; k--)
+        ref_func(SAMPLES, input, out_ref);
+    t_end = pa_rtclock_now();
+    pa_log_info("ref: %llu usec.", (long long unsigned int)(t_end - t_begin));
+}
+
+/*
+ * Self-test and benchmark for pa_sconv_s16le_to_f32ne_neon().
+ *
+ * Converts a buffer of random 16 bit samples with both the NEON function and
+ * the currently registered S16LE -> float converter, logs every sample where
+ * the results differ by more than 0.00001, then logs the time both take for
+ * TIMES runs.
+ */
+static void run_test_to(void) {
+    int16_t samples[SAMPLES];
+    float floats[SAMPLES];
+    float floats_ref[SAMPLES];
+    int i;
+    pa_usec_t start, stop;
+    pa_convert_func_t func;
+
+    pa_log_debug("checking NEON sconv_s16le_to_float(%d)", SAMPLES);
+
+    memset(floats_ref, 0, sizeof(floats_ref));
+    /* clear the whole array, not just its first element */
+    memset(floats, 0, sizeof(floats));
+
+    for (i = 0; i < SAMPLES; i++) {
+        /* the int result is narrowed to int16_t by the assignment */
+        samples[i] = rand() - RAND_MAX/2;
+    }
+
+    func = (pa_convert_func_t) pa_get_convert_to_float32ne_function(PA_SAMPLE_S16LE);
+    func(SAMPLES, samples, floats_ref);
+    pa_sconv_s16le_to_f32ne_neon(SAMPLES, samples, floats);
+
+    for (i = 0; i < SAMPLES; i++) {
+        if (fabsf(floats[i] - floats_ref[i]) > 0.00001) {
+            pa_log_debug("%d: %.8f != %.8f (%d)", i, floats[i], floats_ref[i],
+                         samples[i]);
+        }
+    }
+
+    /* time the NEON implementation */
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        pa_sconv_s16le_to_f32ne_neon(SAMPLES, samples, floats);
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));
+
+    /* time the reference implementation */
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        func(SAMPLES, samples, floats_ref);
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
+}
+
+#endif /* defined(__ARM_NEON__) */
+
+/*
+ * Register the NEON conversion functions for PA_SAMPLE_S16LE if flags has
+ * PA_CPU_ARM_NEON set. Compiles to an empty function when __ARM_NEON__ is
+ * not defined.
+ */
+void pa_convert_func_init_neon(pa_cpu_arm_flag_t flags) {
+#if defined (__ARM_NEON__)
+
+    /* Nothing below may run without NEON support: the self-tests execute
+     * the NEON converters, too. */
+    if (!(flags & PA_CPU_ARM_NEON))
+        return;
+
+#ifdef RUN_TEST
+    /* Run before the NEON functions are installed, because the tests fetch
+     * the currently registered converters as their reference. */
+    run_test_from();
+    run_test_to();
+#endif
+
+    pa_log_info("Initialising ARM NEON optimized conversions.");
+    pa_set_convert_from_float32ne_function(PA_SAMPLE_S16LE, (pa_convert_func_t) pa_sconv_s16le_from_f32ne_neon);
+    pa_set_convert_to_float32ne_function(PA_SAMPLE_S16LE, (pa_convert_func_t) pa_sconv_s16le_to_f32ne_neon);
+#endif /* defined (__ARM_NEON__) */
+}
--
1.7.4.1
More information about the pulseaudio-discuss
mailing list