[pulseaudio-discuss] [PATCH v2 2/6] core: add ARM NEON optimized volume code

Peter Meerwald pmeerw at pmeerw.net
Wed Feb 22 03:45:03 PST 2012


From: Peter Meerwald <p.meerwald at bct-electronic.com>

v2:
* load and store data with vld1/vld1q and vst1/vst1q, resp., to work
  around alignment issues of compiler-generated vldmia instruction
* call test code, the reference implementation is obtained using
  pa_get_volume_func()
* remove redundant check for NEON flags

compiled with Ubuntu/Linaro gcc 4.6.1:
arm-linux-gnueabi-gcc -O2 -mcpu=cortex-a8 -mfloat-abi=softfp -mfpu=neon

runtime on beagle-xm, 800 MHz:

checking NEON volume_float32ne
NEON: 10223 usec.
ref: 46480 usec.
checking NEON volume_s16ne
NEON: 8484 usec.
ARM: 339272 usec.
ref: 20203 usec.

---
 src/Makefile.am              |    1 +
 src/pulsecore/svolume_neon.c |  300 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 301 insertions(+), 0 deletions(-)
 create mode 100644 src/pulsecore/svolume_neon.c

diff --git a/src/Makefile.am b/src/Makefile.am
index a6d1644..a6f9640 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -823,6 +823,7 @@ libpulsecore_@PA_MAJORMINOR@_la_SOURCES = \
 		pulsecore/cpu-orc.c pulsecore/cpu-orc.h \
 		pulsecore/svolume_c.c pulsecore/svolume_arm.c \
 		pulsecore/svolume_mmx.c pulsecore/svolume_sse.c \
+		pulsecore/svolume_neon.c \
 		pulsecore/sconv-s16be.c pulsecore/sconv-s16be.h \
 		pulsecore/sconv-s16le.c pulsecore/sconv-s16le.h \
 		pulsecore/sconv_sse.c \
diff --git a/src/pulsecore/svolume_neon.c b/src/pulsecore/svolume_neon.c
new file mode 100644
index 0000000..32cb509
--- /dev/null
+++ b/src/pulsecore/svolume_neon.c
@@ -0,0 +1,300 @@
+/***
+  This file is part of PulseAudio.
+
+  Copyright 2012 Peter Meerwald <p.meerwald at bct-electronic.com>
+
+  PulseAudio is free software; you can redistribute it and/or modify
+  it under the terms of the GNU Lesser General Public License as published
+  by the Free Software Foundation; either version 2.1 of the License,
+  or (at your option) any later version.
+
+  PulseAudio is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+  General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public License
+  along with PulseAudio; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+  USA.
+***/
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <pulse/rtclock.h>
+
+#include <pulsecore/random.h>
+#include <pulsecore/macro.h>
+
+#include "cpu-arm.h"
+#include "sample-util.h"
+
+#if defined(__ARM_NEON__)
+
+#include <math.h>
+#include <arm_neon.h>
+
+/* Scale 16-bit samples in place by four per-lane 32-bit volume factors.
+ *
+ * Only the leading (length & ~7) samples are touched, eight per
+ * iteration; whatever remains is left for the caller. Each factor is
+ * split into its upper and lower 16 bits so that the result,
+ * sample * hi + ((sample * lo) >> 16), is built from 32-bit products
+ * and then narrowed back to 16 bits with saturation. */
+static inline void vol_s16ne_neon(int32x4_t vol4, int16_t *samples, unsigned length) {
+    const unsigned n_vec = length & ~7;
+    const int16x4_t vol_hi = vshrn_n_s32(vol4, 16);
+    const int32x4_t vol_lo = vandq_s32(vol4, vdupq_n_s32(0xFFFF));
+    unsigned pos = 0;
+
+    while (pos < n_vec) {
+        int16_t *p = samples + pos;
+        const int16x4_t in_a = vld1_s16(p);
+        const int16x4_t in_b = vld1_s16(p + 4);
+
+        /* products with the lower half of the volume, not yet shifted */
+        const int32x4_t lo_a = vmulq_s32(vmovl_s16(in_a), vol_lo);
+        const int32x4_t lo_b = vmulq_s32(vmovl_s16(in_b), vol_lo);
+
+        /* sample * hi + (lower-half product >> 16) */
+        const int32x4_t sum_a = vsraq_n_s32(vmull_s16(in_a, vol_hi), lo_a, 16);
+        const int32x4_t sum_b = vsraq_n_s32(vmull_s16(in_b, vol_hi), lo_b, 16);
+
+        vst1q_s16(p, vcombine_s16(vqmovn_s32(sum_a), vqmovn_s32(sum_b)));
+        pos += 8;
+    }
+}
+
+/* Apply per-channel volume factors to interleaved native-endian 16-bit
+ * samples, in place.
+ *
+ * samples:  interleaved sample buffer, modified in place
+ * volumes:  one 32-bit factor per channel; a sample is multiplied by the
+ *           upper 16 bits of its factor and, shifted right by 16, by the
+ *           lower 16 bits, and the sum is clamped to the int16 range
+ * channels: number of interleaved channels
+ * length:   size of the sample buffer in bytes
+ *
+ * 1, 2 and 4 channels go through the NEON kernel eight samples at a
+ * time, followed by a scalar loop for the remainder; any other channel
+ * count is handled entirely by scalar code. */
+static void pa_volume_s16ne_neon(int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) {
+    unsigned channel = 0, i;
+    int32x4_t vol4;
+
+    length /= sizeof(int16_t);
+
+    /* Build a four-lane volume vector whose lane order matches the
+     * channel interleaving. The factors are fetched with vld1/vld1q
+     * instead of dereferencing a casted vector pointer, so nothing
+     * beyond int32_t alignment is assumed for volumes. */
+    switch (channels) {
+        case 1:
+            vol4 = vdupq_n_s32(*volumes);
+            break;
+        case 2: {
+            int32x2_t vol2 = vld1_s32(volumes);
+            vol4 = vcombine_s32(vol2, vol2);
+            break;
+        }
+        case 4:
+            vol4 = vld1q_s32(volumes);
+            break;
+        default:
+            for (; length; length--) {
+                int32_t t, hi, lo;
+
+                /* Multiplying the 32bit volume factor with the 16bit
+                 * sample might result in an 48bit value. We want to
+                 * do without 64 bit integers and hence do the
+                 * multiplication independently for the HI and LO part
+                 * of the volume. */
+
+                hi = volumes[channel] >> 16;
+                lo = volumes[channel] & 0xFFFF;
+
+                t = (int32_t)(*samples);
+                t = ((t * lo) >> 16) + (t * hi);
+                t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
+                *samples++ = (int16_t) t;
+
+                if (PA_UNLIKELY(++channel >= channels))
+                    channel = 0;
+            }
+            return;
+    }
+
+    vol_s16ne_neon(vol4, samples, length);
+
+    /* Scalar remainder. The vector part covers a multiple of eight
+     * samples, so the first leftover sample belongs to channel 0;
+     * channels is 1, 2 or 4 here, so masking with (channels - 1)
+     * wraps the channel index. */
+    for (i = length & ~7; i < length; i++) {
+        int32_t t = samples[i];
+        int32_t vol = volumes[(channel++) & (channels - 1)];
+        t = ((t * (vol & 0xFFFF)) >> 16) + (t * (vol >> 16));
+        samples[i] = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
+    }
+}
+
+/* Multiply interleaved native-endian float samples by per-channel
+ * volume factors, in place.
+ *
+ * samples:  interleaved sample buffer, modified in place
+ * volumes:  one float factor per channel
+ * channels: number of interleaved channels
+ * length:   size of the sample buffer in bytes
+ *
+ * 1, 2 and 4 channels are processed four samples at a time with NEON,
+ * followed by a scalar loop for the remainder; any other channel count
+ * is handled entirely by scalar code. */
+static void pa_volume_float32ne_neon(float *samples, float *volumes, unsigned channels, unsigned length) {
+    unsigned channel = 0, i;
+    float32x4_t vol4;
+
+    length /= sizeof(float);
+
+    /* Build a four-lane volume vector whose lane order matches the
+     * channel interleaving. The factors are fetched with vld1/vld1q
+     * instead of dereferencing a casted vector pointer, so nothing
+     * beyond float alignment is assumed for volumes. */
+    switch (channels) {
+        case 1:
+            vol4 = vdupq_n_f32(*volumes);
+            break;
+        case 2: {
+            float32x2_t vol2 = vld1_f32(volumes);
+            vol4 = vcombine_f32(vol2, vol2);
+            break;
+        }
+        case 4:
+            vol4 = vld1q_f32(volumes);
+            break;
+        default:
+            for (; length; length--) {
+                *samples++ *= volumes[channel];
+
+                if (PA_UNLIKELY(++channel >= channels))
+                    channel = 0;
+            }
+            return;
+    }
+
+    for (i = 0; i < (length & ~3); i += 4)
+        vst1q_f32(&samples[i], vmulq_f32(vld1q_f32(&samples[i]), vol4));
+
+    /* Scalar remainder of at most three samples. The vector loop stops
+     * at a multiple of four, so the first leftover sample belongs to
+     * channel 0. */
+    for ( ; i < length; i++) {
+        samples[i] *= volumes[channel];
+
+        if (PA_UNLIKELY(++channel >= channels))
+            channel = 0;
+    }
+}
+
+#ifdef RUN_TEST
+#define SAMPLES 1019 /* samples per test buffer; not a multiple of 4 or 8, so the scalar remainder loops run too */
+#define TIMES 1000 /* iterations of each timed benchmark loop */
+#define CHANNELS 1 /* channel count passed to the volume functions under test */
+#define PADDING 16 /* extra volumes[] entries in run_test_s16(), filled with copies of the channel volumes */
+
+/* Self-test and micro-benchmark for pa_volume_float32ne_neon().
+ *
+ * Fills a buffer with random samples, scales one copy with the NEON
+ * routine and another with the function that pa_get_volume_func()
+ * returns for PA_SAMPLE_FLOAT32NE, and logs every sample whose two
+ * results differ by more than 0.00001. Afterwards TIMES runs of each
+ * implementation are timed and the elapsed time is logged. */
+static void run_test_float(void) {
+    float floats[SAMPLES];
+    float floats_ref[SAMPLES];
+    float floats_orig[SAMPLES];
+    float volumes[CHANNELS];
+    unsigned i;
+    pa_usec_t start, stop;
+    pa_do_volume_func_t func;
+
+    pa_log_debug("checking NEON volume_float32ne");
+
+    /* Reference implementation the NEON results are compared against. */
+    func = pa_get_volume_func(PA_SAMPLE_FLOAT32NE);
+
+    /* Random samples in [-0.5, 0.5]. */
+    for (i = 0; i < SAMPLES; i++) {
+        floats_orig[i] = rand()/(float) RAND_MAX - 0.5f;
+    }
+    memcpy(floats_ref, floats_orig, sizeof(floats_orig));
+    memcpy(floats, floats_orig, sizeof(floats_orig));
+
+    /* Random volume factors in [0, 0.5]. */
+    for (i = 0; i < CHANNELS; i++)
+        volumes[i] = 0.5f * rand() / (float) RAND_MAX;
+
+    /* The length argument is a byte count, hence sizeof(). */
+    pa_volume_float32ne_neon(floats, volumes, CHANNELS, sizeof(floats));
+    func(floats_ref, volumes, CHANNELS, sizeof(floats_ref));
+
+    for (i = 0; i < SAMPLES; i++) {
+        if (fabsf(floats[i] - floats_ref[i]) > 0.00001) {
+            pa_log_debug("%u: %.3f != %.3f (%.3f)", i, floats[i], floats_ref[i],
+                      floats_orig[i]);
+        }
+    }
+
+    /* Each timed iteration restores the input first, so the memcpy is
+     * part of both measurements. */
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        memcpy(floats, floats_orig, sizeof(floats_orig));
+        pa_volume_float32ne_neon(floats, volumes, CHANNELS, sizeof(floats));
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));
+
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        memcpy(floats_ref, floats_orig, sizeof(floats_orig));
+        func(floats_ref, volumes, CHANNELS, sizeof(floats_ref));
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
+}
+
+/* Self-test and micro-benchmark for pa_volume_s16ne_neon().
+ *
+ * Fills a buffer with random samples, scales one copy with the NEON
+ * routine and another with the function that pa_get_volume_func()
+ * returns for PA_SAMPLE_S16NE, and logs every sample whose two results
+ * differ. Afterwards TIMES runs of the NEON routine, of
+ * pa_volume_s16ne_arm() and of the reference are timed and the elapsed
+ * time is logged. */
+static void run_test_s16(void) {
+    int16_t samples[SAMPLES];
+    int16_t samples_ref[SAMPLES];
+    int16_t samples_orig[SAMPLES];
+    int32_t volumes[CHANNELS + PADDING];
+    unsigned i, padding;
+    pa_usec_t start, stop;
+    pa_do_volume_func_t func;
+
+    pa_log_debug("checking NEON volume_s16ne");
+
+    /* Reference implementation the NEON results are compared against. */
+    func = pa_get_volume_func(PA_SAMPLE_S16NE);
+
+    /* Random samples; the int value is narrowed to 16 bits on purpose. */
+    for (i = 0; i < SAMPLES; i++) {
+        samples_orig[i] = (int16_t) (rand() - RAND_MAX/2);
+    }
+    memcpy(samples_ref, samples_orig, sizeof(samples_orig));
+    memcpy(samples, samples_orig, sizeof(samples_orig));
+
+    for (i = 0; i < CHANNELS; i++)
+        volumes[i] = PA_CLAMP_VOLUME(rand() >> 15);
+    /* Fill the PADDING entries after the channel volumes with copies
+     * taken from the start of the array; i continues from CHANNELS.
+     * NOTE(review): presumably some implementation reads volumes past
+     * the channel count - confirm against its callers. */
+    for (padding = 0; padding < PADDING; padding++, i++)
+        volumes[i] = volumes[padding];
+
+    /* The length argument is a byte count, hence sizeof(). */
+    pa_volume_s16ne_neon(samples, volumes, CHANNELS, sizeof(samples));
+    func(samples_ref, volumes, CHANNELS, sizeof(samples_ref));
+
+    for (i = 0; i < SAMPLES; i++) {
+        if (samples[i] != samples_ref[i]) {
+            pa_log_debug("%u: %d != %d (%d)", i, samples[i], samples_ref[i],
+                      samples_orig[i]);
+        }
+    }
+
+    /* Each timed iteration restores the input first, so the memcpy is
+     * part of all three measurements. */
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        memcpy(samples, samples_orig, sizeof(samples_orig));
+        pa_volume_s16ne_neon(samples, volumes, CHANNELS, sizeof(samples));
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));
+
+    /* NOTE(review): pa_volume_s16ne_arm() is not declared in this file;
+     * confirm that a declaration is visible when RUN_TEST is defined. */
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        memcpy(samples, samples_orig, sizeof(samples_orig));
+        pa_volume_s16ne_arm(samples, volumes, CHANNELS, sizeof(samples));
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("ARM: %llu usec.", (long long unsigned int)(stop - start));
+
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        memcpy(samples_ref, samples_orig, sizeof(samples_orig));
+        func(samples_ref, volumes, CHANNELS, sizeof(samples_ref));
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
+}
+#endif /* RUN_TEST */
+
+#endif /* defined(__ARM_NEON__) */
+
+/* Install the NEON volume routines for PA_SAMPLE_S16NE and
+ * PA_SAMPLE_FLOAT32NE.
+ *
+ * The body is empty unless the compiler defines __ARM_NEON__. The
+ * flags argument is not examined here. When RUN_TEST is defined, the
+ * self-tests run first, before the NEON routines are installed. */
+void pa_volume_func_init_neon(pa_cpu_arm_flag_t flags) {
+#ifdef __ARM_NEON__
+
+#ifdef RUN_TEST
+    /* run while the previously registered functions are still in place */
+    run_test_float();
+    run_test_s16();
+#endif /* RUN_TEST */
+
+    pa_log_info("Initialising ARM NEON optimized volume functions.");
+
+    pa_set_volume_func(PA_SAMPLE_S16NE,
+                       (pa_do_volume_func_t) pa_volume_s16ne_neon);
+    pa_set_volume_func(PA_SAMPLE_FLOAT32NE,
+                       (pa_do_volume_func_t) pa_volume_float32ne_neon);
+
+#endif /* __ARM_NEON__ */
+}
-- 
1.7.4.1



More information about the pulseaudio-discuss mailing list