[pulseaudio-discuss] [PATCH 6/6 v3] core: Add ARM NEON optimized sample conversion code
Peter Meerwald
pmeerw at pmeerw.net
Tue Jul 24 01:20:36 PDT 2012
From: Peter Meerwald <p.meerwald at bct-electronic.com>
v3:
* convert from intrinsics to inline assembly
v2:
* load and store data with vld1/vld1q and vst1/vst1q, resp., to work
around alignment issues of compiler-generated vldmia instruction
* remove redundant check for NEON flags
Built with Ubuntu/Linaro gcc 4.6.3:
arm-linux-gnueabi-gcc -O2 -mcpu=cortex-a8 -mfloat-abi=softfp -mfpu=neon
Runtime measured on beagle-xm:
D: [pulseaudio] sconv_neon.c: checking NEON sconv_s16le_from_float
I: [pulseaudio] sconv_neon.c: NEON: 3754 usec.
I: [pulseaudio] sconv_neon.c: ref: 58594 usec.
D: [pulseaudio] sconv_neon.c: checking NEON sconv_s16le_to_float
I: [pulseaudio] sconv_neon.c: NEON: 1831 usec.
I: [pulseaudio] sconv_neon.c: ref: 10528 usec.
I: [pulseaudio] sconv_neon.c: Initialising ARM NEON optimized conversions.
Note: the NEON conversion result may be off by one for some samples compared to the reference implementation, due to rounding differences.
Signed-off-by: Peter Meerwald <p.meerwald at bct-electronic.com>
---
src/Makefile.am | 2 +-
src/pulsecore/sconv_neon.c | 209 ++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 210 insertions(+), 1 deletions(-)
create mode 100644 src/pulsecore/sconv_neon.c
diff --git a/src/Makefile.am b/src/Makefile.am
index 6b9df97..0445445 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -832,7 +832,7 @@ libpulsecore_ at PA_MAJORMINOR@_la_SOURCES = \
pulsecore/svolume_neon.c \
pulsecore/sconv-s16be.c pulsecore/sconv-s16be.h \
pulsecore/sconv-s16le.c pulsecore/sconv-s16le.h \
- pulsecore/sconv_sse.c \
+ pulsecore/sconv_sse.c pulsecore/sconv_neon.c \
pulsecore/sconv.c pulsecore/sconv.h \
pulsecore/shared.c pulsecore/shared.h \
pulsecore/sink-input.c pulsecore/sink-input.h \
diff --git a/src/pulsecore/sconv_neon.c b/src/pulsecore/sconv_neon.c
new file mode 100644
index 0000000..94003a0
--- /dev/null
+++ b/src/pulsecore/sconv_neon.c
@@ -0,0 +1,209 @@
+/***
+ This file is part of PulseAudio.
+
+ Copyright 2012 Peter Meerwald <p.meerwald at bct-electronic.com>
+
+ PulseAudio is free software; you can redistribute it and/or modify
+ it under the terms of the GNU Lesser General Public License as published
+ by the Free Software Foundation; either version 2.1 of the License,
+ or (at your option) any later version.
+
+ PulseAudio is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+***/
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <pulse/rtclock.h>
+
+#include <pulsecore/macro.h>
+#include <pulsecore/endianmacros.h>
+
+#include "cpu-arm.h"
+#include "sconv.h"
+
+#if defined(__ARM_NEON__)
+
+#include <math.h>
+#include <arm_neon.h>
+
+#define RUN_TEST
+
+/*
+ * Convert n native-endian float samples at src into signed 16-bit samples
+ * at dst.
+ *
+ * Full groups of four samples are processed by the NEON loop: every value
+ * is clamped to [-1.0, 1.0], multiplied by 32767, rounded and narrowed to
+ * 16 bits. The remaining n % 4 samples are converted by the scalar loop at
+ * the end, which rounds with lrintf(); the two paths may therefore differ
+ * by one for some inputs.
+ */
+static void pa_sconv_s16le_from_f32ne_neon(unsigned n, const float *src, int16_t *dst) {
+    unsigned i = n & 3; /* samples left over after the 4-wide groups */
+
+    asm volatile (
+        "movs %[n], %[n], lsr #2\n\t" /* n = number of 4-sample groups */
+        /* The loop below tests its counter only after the first pass, so
+         * it must be skipped entirely when there is no full group;
+         * otherwise four samples would be read and written out of bounds. */
+        "beq 2f\n\t"
+        "vdup.f32 q2, %[plusone]\n\t" /* q2 = +1.0 (upper clamp bound) */
+        "vneg.f32 q3, q2\n\t" /* q3 = -1.0 (lower clamp bound) */
+        "vdup.f32 q4, %[scale]\n\t"
+        "vdup.u32 q5, %[mask]\n\t" /* sign-bit mask */
+        "vdup.f32 q6, %[half]\n\t"
+        "1:\n\t"
+        "vld1.32 {q0}, [%[src]]!\n\t"
+        "vmin.f32 q0, q0, q2\n\t" /* clamp */
+        "vmax.f32 q0, q0, q3\n\t"
+        "vmul.f32 q0, q0, q4\n\t" /* scale */
+        "vand.u32 q1, q0, q5\n\t" /* q1 = sign bit of each sample */
+        "vorr.u32 q1, q1, q6\n\t" /* round: q1 = 0.5 carrying the sample's sign */
+        "vadd.f32 q0, q0, q1\n\t"
+        "vcvt.s32.f32 q0, q0\n\t" /* narrow */
+        "vmovn.i32 d0, q0\n\t"
+        "subs %[n], %[n], #1\n\t"
+        "vst1.16 {d0}, [%[dst]]!\n\t"
+        "bgt 1b\n\t"
+        "2:\n\t"
+        /* output operands (or input operands that get modified) */
+        : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n)
+        : [plusone] "r" (1.0f), [scale] "r" (32767.0f),
+          [half] "r" (0.5f), [mask] "r" (0x80000000) /* input operands */
+        : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6" /* clobber list */
+    );
+
+    /* leftovers: src and dst were advanced past the vectorized part by the asm */
+    while (i--) {
+        *dst++ = (int16_t) lrintf(PA_CLAMP_UNLIKELY(*src, -1.0f, 1.0f) * 0x7FFF);
+        src++;
+    }
+}
+
+/*
+ * Convert n signed 16-bit samples at src into native-endian float samples
+ * at dst by multiplying each value with 1/32767.
+ *
+ * Full groups of four samples are processed by the NEON loop; the
+ * remaining n % 4 samples are converted by the scalar loop at the end.
+ */
+static void pa_sconv_s16le_to_f32ne_neon(unsigned n, const int16_t *src, float *dst) {
+    unsigned i = n & 3; /* samples left over after the 4-wide groups */
+
+    const float invscale = 1.0f / 0x7FFF;
+
+    asm volatile (
+        "movs %[n], %[n], lsr #2\n\t" /* n = number of 4-sample groups */
+        /* The loop below tests its counter only after the first pass, so
+         * it must be skipped entirely when there is no full group;
+         * otherwise four samples would be read and written out of bounds. */
+        "beq 2f\n\t"
+        "vdup.f32 q1, %[invscale]\n\t"
+        "1:\n\t"
+        "vld1.16 {d0}, [%[src]]!\n\t"
+        "vmovl.s16 q0, d0\n\t" /* widen s16 -> s32 */
+        "vcvt.f32.s32 q0, q0\n\t"
+        "vmul.f32 q0, q0, q1\n\t"
+        "subs %[n], %[n], #1\n\t"
+        "vst1.32 {q0}, [%[dst]]!\n\t"
+        "bgt 1b\n\t"
+        "2:\n\t"
+        /* output operands (or input operands that get modified) */
+        : [dst] "+r" (dst), [src] "+r" (src), [n] "+r" (n)
+        : [invscale] "r" (invscale) /* input operands */
+        : "memory", "cc", "q0", "q1" /* clobber list */
+    );
+
+    /* leftovers: src and dst were advanced past the vectorized part by the asm */
+    while (i--) {
+        *dst++ = *src++ * invscale;
+    }
+}
+
+#ifdef RUN_TEST
+#define SAMPLES 1019
+#define TIMES 300
+
+/*
+ * Self-test and micro-benchmark for the float -> s16le NEON converter.
+ *
+ * Converts SAMPLES random floats with both the currently registered
+ * converter and the NEON routine, logs the first sample where the two
+ * outputs differ, then logs the time taken by TIMES runs of each.
+ */
+static void run_test_from(void) {
+    float input[SAMPLES];
+    int16_t out_neon[SAMPLES];
+    int16_t out_ref[SAMPLES];
+    pa_convert_func_t ref_func;
+    pa_usec_t t0, t1;
+    int k;
+
+    pa_log_debug("checking NEON sconv_s16le_from_float");
+
+    memset(out_ref, 0, sizeof(out_ref));
+    memset(out_neon, 0, sizeof(out_neon));
+
+    /* random input in [-1.05, 1.05], so the clamping is exercised too */
+    for (k = 0; k < SAMPLES; k++)
+        input[k] = 2.1f * (rand()/(float) RAND_MAX - 0.5f);
+
+    ref_func = (pa_convert_func_t) pa_get_convert_from_float32ne_function(PA_SAMPLE_S16LE);
+    ref_func(SAMPLES, input, out_ref);
+    pa_sconv_s16le_from_f32ne_neon(SAMPLES, input, out_neon);
+
+    /* find the first mismatching sample, if any, and report only that one */
+    for (k = 0; k < SAMPLES && out_neon[k] == out_ref[k]; k++)
+        ;
+    if (k < SAMPLES)
+        pa_log_debug("%d: %d != %d (%f)", k, out_neon[k], out_ref[k],
+                     input[k]);
+
+    /* time the NEON routine */
+    t0 = pa_rtclock_now();
+    for (k = TIMES; k > 0; k--)
+        pa_sconv_s16le_from_f32ne_neon(SAMPLES, input, out_neon);
+    t1 = pa_rtclock_now();
+    pa_log_info("NEON: %llu usec.", (long long unsigned int)(t1 - t0));
+
+    /* time the reference routine */
+    t0 = pa_rtclock_now();
+    for (k = TIMES; k > 0; k--)
+        ref_func(SAMPLES, input, out_ref);
+    t1 = pa_rtclock_now();
+    pa_log_info("ref: %llu usec.", (long long unsigned int)(t1 - t0));
+}
+
+/*
+ * Self-test and micro-benchmark for the s16le -> float NEON converter.
+ *
+ * Converts SAMPLES random 16-bit samples with both the currently registered
+ * converter and the NEON routine, logs the first sample where the outputs
+ * differ by more than 0.00001, then logs the time taken by TIMES runs of
+ * each.
+ */
+static void run_test_to(void) {
+    int16_t samples[SAMPLES];
+    float floats[SAMPLES];
+    float floats_ref[SAMPLES];
+    int i;
+    pa_usec_t start, stop;
+    pa_convert_func_t func;
+
+    pa_log_debug("checking NEON sconv_s16le_to_float");
+
+    memset(floats_ref, 0, sizeof(floats_ref));
+    /* clear the whole array; sizeof(float) would only clear the first element */
+    memset(floats, 0, sizeof(floats));
+
+    for (i = 0; i < SAMPLES; i++) {
+        /* the int value is deliberately narrowed to 16 bits */
+        samples[i] = (int16_t) (rand() - RAND_MAX/2);
+    }
+
+    func = (pa_convert_func_t) pa_get_convert_to_float32ne_function(PA_SAMPLE_S16LE);
+    func(SAMPLES, samples, floats_ref);
+    pa_sconv_s16le_to_f32ne_neon(SAMPLES, samples, floats);
+
+    /* report only the first mismatch */
+    for (i = 0; i < SAMPLES; i++) {
+        if (fabsf(floats[i] - floats_ref[i]) > 0.00001) {
+            pa_log_debug("%d: %.8f != %.8f (%d)", i, floats[i], floats_ref[i],
+                         samples[i]);
+            break;
+        }
+    }
+
+    /* time the NEON routine */
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        pa_sconv_s16le_to_f32ne_neon(SAMPLES, samples, floats);
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));
+
+    /* time the reference routine */
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        func(SAMPLES, samples, floats_ref);
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
+}
+#endif /* RUN_TEST */
+
+#endif /* defined(__ARM_NEON__) */
+
+/*
+ * Register the NEON s16le <-> float32ne converters.
+ *
+ * The body is compiled only when __ARM_NEON__ is defined; otherwise the
+ * function does nothing. When RUN_TEST is defined the self-tests run
+ * before the converters are installed. The flags argument is not examined.
+ */
+void pa_convert_func_init_neon(pa_cpu_arm_flag_t flags) {
+#ifdef __ARM_NEON__
+    pa_convert_func_t from_float = (pa_convert_func_t) pa_sconv_s16le_from_f32ne_neon;
+    pa_convert_func_t to_float = (pa_convert_func_t) pa_sconv_s16le_to_f32ne_neon;
+
+#if defined(RUN_TEST)
+    run_test_from();
+    run_test_to();
+#endif /* RUN_TEST */
+
+    pa_log_info("Initialising ARM NEON optimized conversions.");
+    pa_set_convert_from_float32ne_function(PA_SAMPLE_S16LE, from_float);
+    pa_set_convert_to_float32ne_function(PA_SAMPLE_S16LE, to_float);
+#endif /* __ARM_NEON__ */
+}
--
1.7.5.4
More information about the pulseaudio-discuss
mailing list