[pulseaudio-discuss] [PATCH 5/6 v3] core: Add ARM NEON optimized volume code
Peter Meerwald
pmeerw at pmeerw.net
Tue Jul 24 01:20:35 PDT 2012
From: Peter Meerwald <p.meerwald at bct-electronic.com>
v3:
* convert from intrinsics to inline assembly
v2:
* load and store data with vld1/vld1q and vst1/vst1q, resp., to work
around alignment issues of compiler-generated vldmia instruction
* call test code, the reference implementation is obtained using
pa_get_volume_func()
* remove redundant check for NEON flags
compiled with Ubuntu/Linaro gcc 4.6.3:
arm-linux-gnueabi-gcc -O2 -mcpu=cortex-a8 -mfloat-abi=softfp -mfpu=neon
runtime on beagle-xm:
D: [pulseaudio] svolume_neon.c: checking NEON volume_float32ne
I: [pulseaudio] svolume_neon.c: NEON: 4669 usec.
I: [pulseaudio] svolume_neon.c: ref: 48462 usec.
D: [pulseaudio] svolume_neon.c: checking NEON volume_s16ne
I: [pulseaudio] svolume_neon.c: NEON: 13946 usec.
I: [pulseaudio] svolume_neon.c: ref: 22004 usec.
I: [pulseaudio] svolume_neon.c: Initialising ARM NEON optimized volume functions.
Signed-off-by: Peter Meerwald <p.meerwald at bct-electronic.com>
---
src/Makefile.am | 1 +
src/pulsecore/svolume_neon.c | 315 ++++++++++++++++++++++++++++++++++++++++++
2 files changed, 316 insertions(+), 0 deletions(-)
create mode 100644 src/pulsecore/svolume_neon.c
diff --git a/src/Makefile.am b/src/Makefile.am
index df25efc..6b9df97 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -829,6 +829,7 @@ libpulsecore_@PA_MAJORMINOR@_la_SOURCES = \
pulsecore/cpu-orc.c pulsecore/cpu-orc.h \
pulsecore/svolume_c.c pulsecore/svolume_arm.c \
pulsecore/svolume_mmx.c pulsecore/svolume_sse.c \
+ pulsecore/svolume_neon.c \
pulsecore/sconv-s16be.c pulsecore/sconv-s16be.h \
pulsecore/sconv-s16le.c pulsecore/sconv-s16le.h \
pulsecore/sconv_sse.c \
diff --git a/src/pulsecore/svolume_neon.c b/src/pulsecore/svolume_neon.c
new file mode 100644
index 0000000..7789557
--- /dev/null
+++ b/src/pulsecore/svolume_neon.c
@@ -0,0 +1,315 @@
+/***
+ This file is part of PulseAudio.
+
+ Copyright 2012 Peter Meerwald <p.meerwald at bct-electronic.com>
+
+ PulseAudio is free software; you can redistribute it and/or modify
+ it under the terms of the GNU Lesser General Public License as published
+ by the Free Software Foundation; either version 2.1 of the License,
+ or (at your option) any later version.
+
+ PulseAudio is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+***/
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <pulse/rtclock.h>
+
+#include <pulsecore/random.h>
+#include <pulsecore/macro.h>
+
+#include "cpu-arm.h"
+#include "sample-util.h"
+
+#if defined(__ARM_NEON__)
+
+#include <math.h>
+#include <arm_neon.h>
+
+#define RUN_TEST
+
+/* Scale signed 16-bit samples in place, four at a time, by per-lane volumes.
+ *
+ * vol4    - four 32-bit volume factors; lane k is applied to samples
+ *           k, k+4, k+8, ...
+ * samples - buffer to scale in place
+ * length  - number of samples; only the first (length & ~3) are processed,
+ *           any remainder is left untouched for the caller
+ *
+ * Per sample this computes (s * hi) + ((s * lo) >> 16), where hi and lo are
+ * the upper and lower 16 bits of the volume, and saturates the result to the
+ * int16_t range so that it agrees with the clamping scalar code paths. */
+static inline void vol_s16_neon(const uint32x4_t *vol4, int16_t *samples, unsigned length) {
+    unsigned blocks = length >> 2; /* number of groups of four samples */
+
+    /* The loop below tests its counter only after the body has run, so it
+     * must never be entered with a zero count: it would read and write four
+     * samples that are not there. */
+    if (blocks == 0)
+        return;
+
+    asm volatile (
+        "vld1.s32 {q0}, [%[vol]]\n\t"
+        "vshl.u32 q3, q0, #16\n\t" /* lo */
+        "vshrn.s32 d1, q0, #16\n\t" /* hi */
+        "vshr.u32 q3, q3, #16\n\t"
+        "1:\n\t"
+        "vld1.16 {d0}, [%[samples]]\n\t"
+
+        "vmull.s16 q1, d0, d1\n\t"      /* q1 = sample * hi */
+
+        "vmovl.s16 q2, d0\n\t"
+        "vmul.s32 q2, q2, q3\n\t"       /* q2 = sample * lo */
+
+        "vsra.s32 q1, q2, #16\n\t"      /* q1 += q2 >> 16 */
+        "vqmovn.s32 d0, q1\n\t"         /* narrow with saturation */
+
+        "subs %[blocks], %[blocks], #1\n\t"
+        "vst1.16 {d0}, [%[samples]]!\n\t"
+        "bgt 1b\n\t"
+        /* output operands (or input operands that get modified) */
+        : [samples] "+r" (samples), [blocks] "+r" (blocks)
+        : [vol] "r" (vol4) /* input operands */
+        : "memory", "cc", "q0", "q1", "q2", "q3" /* clobber list */
+    );
+}
+
+/* Multiply float samples in place, four at a time, by per-lane volumes.
+ *
+ * vol4    - four volume factors; lane k is applied to samples k, k+4, ...
+ * samples - buffer to scale in place
+ * length  - number of samples; only the first (length & ~3) are processed,
+ *           any remainder is left untouched for the caller */
+static inline void vol_float_neon(const float32x4_t *vol4, float *samples, unsigned length) {
+    unsigned blocks = length >> 2; /* number of groups of four samples */
+
+    /* The loop below tests its counter only after the body has run, so it
+     * must never be entered with a zero count: it would read and write four
+     * samples that are not there. */
+    if (blocks == 0)
+        return;
+
+    asm volatile (
+        "vld1.32 {q1}, [%[vol]]\n\t"
+        "1:\n\t"
+        "vld1.32 {q0}, [%[samples]]\n\t"
+        "vmul.f32 q0, q0, q1\n\t"
+        "subs %[blocks], %[blocks], #1\n\t"
+        "vst1.32 {q0}, [%[samples]]!\n\t"
+        "bgt 1b\n\t"
+        /* output operands (or input operands that get modified) */
+        : [samples] "+r" (samples), [blocks] "+r" (blocks)
+        : [vol] "r" (vol4) /* input operands */
+        : "memory", "cc", "q0", "q1" /* clobber list */
+    );
+}
+
+/* Scale one sample by a 32-bit volume and clamp the result to int16_t range.
+ *
+ * Multiplying the 32bit volume factor with the 16bit sample might result in
+ * a 48bit value. We want to do without 64 bit integers and hence do the
+ * multiplication independently for the HI and LO part of the volume. */
+static inline int16_t vol_s16_scalar(int16_t sample, uint32_t vol) {
+    int32_t t = sample;
+
+    t = ((int32_t) (t * (vol & 0xFFFF)) >> 16) + (t * (vol >> 16));
+    return (int16_t) PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
+}
+
+/* Apply per-channel volume to native-endian signed 16-bit samples in place.
+ *
+ * samples  - interleaved sample buffer, modified in place
+ * volumes  - one 32-bit factor per channel (upper 16 bits multiply the
+ *            sample directly, lower 16 bits contribute (sample * lo) >> 16)
+ * channels - number of interleaved channels
+ * length   - size of the sample buffer in bytes
+ *
+ * For 1, 2 and 4 channels the bulk of the buffer is handled four samples at
+ * a time by vol_s16_neon() and the up to three trailing samples by scalar
+ * code. Any other channel count is handled entirely by scalar code. */
+static void pa_volume_s16ne_neon(int16_t *samples, const uint32_t *volumes, unsigned channels, unsigned length) {
+    unsigned channel = 0, i;
+    uint32x2_t vol2;
+    uint32x4_t vol4;
+
+    length /= sizeof(int16_t);
+
+    switch (channels) {
+        case 1:
+            vol4 = vdupq_n_u32(*volumes);
+            vol_s16_neon(&vol4, samples, length);
+
+            for (i = length & ~3; i < length; i++)
+                samples[i] = vol_s16_scalar(samples[i], *volumes);
+            break;
+        case 2:
+            /* vld1 only needs element alignment, unlike dereferencing the
+             * pointer as a vector type. */
+            vol2 = vld1_u32(volumes);
+            vol4 = vcombine_u32(vol2, vol2);
+            vol_s16_neon(&vol4, samples, length);
+
+            /* The tail starts at a multiple of four, i.e. on channel 0. */
+            for (i = length & ~3; i < length; i++)
+                samples[i] = vol_s16_scalar(samples[i], volumes[(channel++) & 1]);
+            break;
+        case 4:
+            vol4 = vld1q_u32(volumes);
+            vol_s16_neon(&vol4, samples, length);
+
+            for (i = length & ~3; i < length; i++)
+                samples[i] = vol_s16_scalar(samples[i], volumes[(channel++) & 3]);
+            break;
+        default:
+            for (; length; length--) {
+                *samples = vol_s16_scalar(*samples, volumes[channel]);
+                samples++;
+
+                if (PA_UNLIKELY(++channel >= channels))
+                    channel = 0;
+            }
+            break;
+    }
+}
+
+/* Apply per-channel volume to native-endian float samples in place.
+ *
+ * samples  - interleaved sample buffer, modified in place
+ * volumes  - one multiplication factor per channel
+ * channels - number of interleaved channels
+ * length   - size of the sample buffer in bytes
+ *
+ * For 1, 2 and 4 channels the bulk of the buffer is handled four samples at
+ * a time by vol_float_neon() and the up to three trailing samples by scalar
+ * code. Any other channel count is handled entirely by scalar code. */
+static void pa_volume_float32ne_neon(float *samples, const float *volumes, unsigned channels, unsigned length) {
+    unsigned channel = 0, i;
+    float32x2_t vol2;
+    float32x4_t vol4;
+
+    length /= sizeof(float);
+
+    switch (channels) {
+        case 1:
+            vol4 = vdupq_n_f32(*volumes);
+            vol_float_neon(&vol4, samples, length);
+
+            for (i = length & ~3; i < length; i++)
+                samples[i] *= volumes[0];
+            break;
+        case 2:
+            /* vld1 only needs element alignment, unlike dereferencing the
+             * pointer as a vector type. */
+            vol2 = vld1_f32(volumes);
+            vol4 = vcombine_f32(vol2, vol2);
+            vol_float_neon(&vol4, samples, length);
+
+            /* The tail starts at a multiple of four, i.e. on channel 0. */
+            for (i = length & ~3; i < length; i++) {
+                samples[i] *= volumes[channel];
+
+                if (PA_UNLIKELY(++channel >= channels))
+                    channel = 0;
+            }
+            break;
+        case 4:
+            vol4 = vld1q_f32(volumes);
+            vol_float_neon(&vol4, samples, length);
+
+            /* At most three trailing samples, so channel stays below 4. */
+            for (i = length & ~3; i < length; i++)
+                samples[i] *= volumes[channel++];
+            break;
+        default:
+            for (; length; length--) {
+                *samples++ *= volumes[channel];
+
+                if (PA_UNLIKELY(++channel >= channels))
+                    channel = 0;
+            }
+            break;
+    }
+}
+
+#ifdef RUN_TEST
+#define SAMPLES 1019
+#define TIMES 1000
+#define CHANNELS 1
+#define PADDING 16
+
+/* Self-test and micro-benchmark for pa_volume_float32ne_neon().
+ *
+ * Fills SAMPLES random floats, scales one copy with the NEON routine and
+ * another with the function returned by pa_get_volume_func(), and logs the
+ * first sample whose results differ by more than 0.00001. Afterwards each
+ * routine is run TIMES times and the elapsed time is logged. */
+static void run_test_float(void) {
+    float floats[SAMPLES];
+    float floats_ref[SAMPLES];
+    float floats_orig[SAMPLES];
+    float volumes[CHANNELS];
+    unsigned i;
+    pa_usec_t start, stop;
+    pa_do_volume_func_t func;
+
+    pa_log_debug("checking NEON volume_float32ne");
+
+    /* Reference implementation to compare against. */
+    func = pa_get_volume_func(PA_SAMPLE_FLOAT32NE);
+
+    for (i = 0; i < SAMPLES; i++) {
+        floats_orig[i] = rand()/(float) RAND_MAX - 0.5f;
+    }
+    memcpy(floats_ref, floats_orig, sizeof(floats_orig));
+    memcpy(floats, floats_orig, sizeof(floats_orig));
+
+    for (i = 0; i < CHANNELS; i++)
+        volumes[i] = 0.5f * rand() / (float) RAND_MAX;
+
+    pa_volume_float32ne_neon(floats, volumes, CHANNELS, sizeof(floats));
+    func(floats_ref, volumes, CHANNELS, sizeof(floats_ref));
+
+    /* Report only the first mismatch. */
+    for (i = 0; i < SAMPLES; i++) {
+        if (fabsf(floats[i] - floats_ref[i]) > 0.00001) {
+            pa_log_debug("%u: %.3f != %.3f (%.3f)", i, floats[i], floats_ref[i],
+                         floats_orig[i]);
+            break;
+        }
+    }
+
+    /* The input is restored before every run so both routines always
+     * process the same data. */
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        memcpy(floats, floats_orig, sizeof(floats_orig));
+        pa_volume_float32ne_neon(floats, volumes, CHANNELS, sizeof(floats));
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));
+
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        memcpy(floats_ref, floats_orig, sizeof(floats_orig));
+        func(floats_ref, volumes, CHANNELS, sizeof(floats_ref));
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
+}
+
+/* Self-test and micro-benchmark for pa_volume_s16ne_neon().
+ *
+ * Fills SAMPLES random 16-bit samples, scales one copy with the NEON routine
+ * and another with the function returned by pa_get_volume_func(), and logs
+ * the first sample whose results differ. Afterwards each routine is run
+ * TIMES times and the elapsed time is logged. */
+static void run_test_s16(void) {
+    int16_t samples[SAMPLES];
+    int16_t samples_ref[SAMPLES];
+    int16_t samples_orig[SAMPLES];
+    /* Unsigned to match the volumes parameter of pa_volume_s16ne_neon(). */
+    uint32_t volumes[CHANNELS + PADDING];
+    unsigned i, padding;
+    pa_usec_t start, stop;
+    pa_do_volume_func_t func;
+
+    pa_log_debug("checking NEON volume_s16ne");
+
+    /* Reference implementation to compare against. */
+    func = pa_get_volume_func(PA_SAMPLE_S16NE);
+
+    for (i = 0; i < SAMPLES; i++) {
+        samples_orig[i] = rand() - RAND_MAX/2;
+    }
+    memcpy(samples_ref, samples_orig, sizeof(samples_orig));
+    memcpy(samples, samples_orig, sizeof(samples_orig));
+
+    for (i = 0; i < CHANNELS; i++)
+        volumes[i] = PA_CLAMP_VOLUME(rand() >> 15);
+    /* Repeat the channel volumes into the PADDING entries that follow. */
+    for (padding = 0; padding < PADDING; padding++, i++)
+        volumes[i] = volumes[padding];
+
+    pa_volume_s16ne_neon(samples, volumes, CHANNELS, sizeof(samples));
+    func(samples_ref, volumes, CHANNELS, sizeof(samples_ref));
+
+    /* Report only the first mismatch. */
+    for (i = 0; i < SAMPLES; i++) {
+        if (abs(samples[i] - samples_ref[i]) > 0) {
+            pa_log_debug("%u: %d != %d (%d)", i, samples[i], samples_ref[i],
+                         samples_orig[i]);
+            break;
+        }
+    }
+
+    /* The input is restored before every run so both routines always
+     * process the same data. */
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        memcpy(samples, samples_orig, sizeof(samples_orig));
+        pa_volume_s16ne_neon(samples, volumes, CHANNELS, sizeof(samples));
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));
+
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        memcpy(samples_ref, samples_orig, sizeof(samples_orig));
+        func(samples_ref, volumes, CHANNELS, sizeof(samples_ref));
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
+}
+#endif /* RUN_TEST */
+
+#endif /* defined(__ARM_NEON__) */
+
+/* Install the NEON volume routines for native-endian S16 and float32
+ * samples. When RUN_TEST is defined the self-tests run first. The body is
+ * empty when the file is built without __ARM_NEON__; the flags argument is
+ * not consulted here. */
+void pa_volume_func_init_neon(pa_cpu_arm_flag_t flags) {
+#ifdef __ARM_NEON__
+
+#if defined(RUN_TEST)
+    run_test_float();
+    run_test_s16();
+#endif /* RUN_TEST */
+
+    pa_log_info("Initialising ARM NEON optimized volume functions.");
+    pa_set_volume_func(PA_SAMPLE_S16NE, (pa_do_volume_func_t) pa_volume_s16ne_neon);
+    pa_set_volume_func(PA_SAMPLE_FLOAT32NE, (pa_do_volume_func_t) pa_volume_float32ne_neon);
+
+#endif /* __ARM_NEON__ */
+}
--
1.7.5.4
More information about the pulseaudio-discuss
mailing list