[pulseaudio-discuss] [PATCH 5/6 v3] core: Add ARM NEON optimized volume code

Tue Jul 24 01:20:35 PDT 2012

From: Peter Meerwald <p.meerwald at bct-electronic.com>

v3:
* convert from intrinsics to inline assembly
v2:
* load and store data with vld1/vld1q and vst1/vst1q, resp., to work
  around alignment issues of compiler-generated vldmia instruction
* call test code, the reference implementation is obtained using
  pa_get_volume_func()
* remove redundant check for NEON flags

compiled with Ubuntu/Linaro gcc 4.6.3:
arm-linux-gnueabi-gcc -O2 -mcpu=cortex-a8 -mfloat-abi=softfp -mfpu=neon

runtime on beagle-xm:

D: [pulseaudio] svolume_neon.c: checking NEON volume_float32ne
I: [pulseaudio] svolume_neon.c: NEON: 4669 usec.
I: [pulseaudio] svolume_neon.c: ref: 48462 usec.
D: [pulseaudio] svolume_neon.c: checking NEON volume_s16ne
I: [pulseaudio] svolume_neon.c: NEON: 13946 usec.
I: [pulseaudio] svolume_neon.c: ref: 22004 usec.
I: [pulseaudio] svolume_neon.c: Initialising ARM NEON optimized volume functions.

Signed-off-by: Peter Meerwald <p.meerwald at bct-electronic.com>
---
 src/Makefile.am              |    1 +
 src/pulsecore/svolume_neon.c |  315 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 316 insertions(+), 0 deletions(-)
 create mode 100644 src/pulsecore/svolume_neon.c

diff --git a/src/Makefile.am b/src/Makefile.am
index df25efc..6b9df97 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -829,6 +829,7 @@ libpulsecore_@PA_MAJORMINOR@_la_SOURCES = \
 		pulsecore/cpu-orc.c pulsecore/cpu-orc.h \
 		pulsecore/svolume_c.c pulsecore/svolume_arm.c \
 		pulsecore/svolume_mmx.c pulsecore/svolume_sse.c \
+		pulsecore/svolume_neon.c \
 		pulsecore/sconv-s16be.c pulsecore/sconv-s16be.h \
 		pulsecore/sconv-s16le.c pulsecore/sconv-s16le.h \
 		pulsecore/sconv_sse.c \
diff --git a/src/pulsecore/svolume_neon.c b/src/pulsecore/svolume_neon.c
new file mode 100644
index 0000000..7789557
--- /dev/null
+++ b/src/pulsecore/svolume_neon.c
@@ -0,0 +1,315 @@
+/***
+  This file is part of PulseAudio.
+
+  Copyright 2012 Peter Meerwald <p.meerwald at bct-electronic.com>
+
+  PulseAudio is free software; you can redistribute it and/or modify
+  it under the terms of the GNU Lesser General Public License as published
+  by the Free Software Foundation; either version 2.1 of the License,
+  or (at your option) any later version.
+
+  PulseAudio is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+  General Public License for more details.
+***/
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <pulse/rtclock.h>
+
+#include <pulsecore/random.h>
+#include <pulsecore/macro.h>
+
+#include "cpu-arm.h"
+#include "sample-util.h"
+
+#if defined(__ARM_NEON__)
+
+#include <math.h>
+#include <arm_neon.h>
+
+#define RUN_TEST
+
+/* Scale whole groups of four int16 samples in place. Each sample s becomes sat16(s * hi + ((s * lo) >> 16)),
+ * where hi/lo are the upper/lower 16 bits of the matching 32-bit lane of *vol4. length counts samples;
+ * a remainder of 1-3 samples is left untouched for the caller. */
+static inline void vol_s16_neon(const uint32x4_t *vol4, int16_t *samples, unsigned length) {
+    if (length < 4) /* the loop tests its counter only after the first pass, so zero groups must not enter it */
+        return;
+    asm volatile (
+    "mov        %[length], %[length], lsr #2\n\t" /* number of 4-sample groups */
+    "vld1.s32   {q0}, [%[vol]]\n\t"
+    "vshl.u32   q3, q0, #16\n\t"
+    "vshrn.s32  d1, q0, #16\n\t" /* d1 = hi halves, narrowed to 16 bit */
+    "vshr.u32   q3, q3, #16\n\t" /* q3 = lo halves, zero-extended to 32 bit */
+    "1:\n\t"
+    "vld1.16    {d0}, [%[samples]]\n\t"
+    "vmull.s16  q1, d0, d1\n\t" /* q1 = sample * hi */
+    "vmovl.s16  q2, d0\n\t"
+    "vmul.s32   q2, q2, q3\n\t" /* q2 = sample * lo */
+    "vsra.s32   q1, q2, #16\n\t" /* q1 += q2 >> 16 */
+    "vqmovn.s32 d0, q1\n\t" /* saturating narrow, matching the clamp to [-0x8000, 0x7FFF] in the C paths */
+    "subs       %[length], %[length], #1\n\t"
+    "vst1.16    {d0}, [%[samples]]!\n\t"
+    "bgt        1b\n\t"
+    : [samples] "+r" (samples), [length] "+r" (length) /* read and modified */
+    : [vol] "r" (vol4) /* input only */
+    : "memory", "cc", "q0", "q1", "q2", "q3" /* clobber list */
+    );
+}
+
+/* Multiply whole groups of four floats in place by the four factors at vol4; length counts samples, 1-3 leftovers are skipped. */
+static inline void vol_float_neon(const float32x4_t *vol4, float *samples, unsigned length) {
+    if (length < 4) return; /* the loop always runs once before testing its counter, so zero groups must not enter it */
+    asm volatile (
+    "mov        %[length], %[length], lsr #2\n\t" /* number of 4-sample groups */
+    "vld1.32    {q1}, [%[vol]]\n\t"
+    "1: vld1.32 {q0}, [%[samples]]\n\t"
+    "vmul.f32   q0, q0, q1\n\t"
+    "subs       %[length], %[length], #1\n\t"
+    "vst1.32    {q0}, [%[samples]]!\n\t"
+    "bgt        1b\n\t"
+    : [samples] "+r" (samples), [length] "+r" (length) /* read and modified */
+    : [vol] "r" (vol4) /* input only */
+    : "memory", "cc", "q0", "q1" /* clobber list */
+    );
+}
+
+/* Apply per-channel software volume to native-endian signed 16-bit samples, in place.
+ *
+ * samples:  interleaved samples; length is the size of this buffer in bytes
+ * volumes:  one 32-bit factor per channel, applied as s * hi + ((s * lo) >> 16)
+ *           where hi/lo are its upper/lower 16 bits
+ * channels: number of entries in volumes; samples cycle through them in order
+ *
+ * For 1, 2 or 4 channels whole groups of four samples go through vol_s16_neon();
+ * the remainder, and every other channel count, is scaled by the C loop. */
+static void pa_volume_s16ne_neon(int16_t *samples, const uint32_t *volumes, unsigned channels, unsigned length) {
+    unsigned channel = 0, i = 0;
+    uint32x4_t vol4 = vdupq_n_u32(0);
+    int use_neon = 1;
+
+    length /= sizeof(int16_t);
+
+    /* Build the four-lane volume vector. vld1/vld1q are used instead of
+     * dereferencing a vector-typed pointer because volumes is only known to
+     * be aligned for uint32_t, not for a NEON vector type. */
+    switch (channels) {
+        case 1:
+            vol4 = vdupq_n_u32(volumes[0]);
+            break;
+        case 2: {
+            uint32x2_t vol2 = vld1_u32(volumes);
+            vol4 = vcombine_u32(vol2, vol2);
+            break;
+        }
+        case 4:
+            vol4 = vld1q_u32(volumes);
+            break;
+        default:
+            use_neon = 0;
+            break;
+    }
+
+    /* vol_s16_neon() handles whole groups of four samples only, so skip it
+     * when there is none. The C loop resumes after the last complete group;
+     * that index is a multiple of 4 and therefore of channels, so starting
+     * at channel 0 is correct there as well as for the pure C path. */
+    if (use_neon && length >= 4) {
+        vol_s16_neon(&vol4, samples, length);
+        i = length & ~3U;
+    }
+
+    for (; i < length; i++) {
+        int32_t t;
+        uint32_t hi, lo;
+
+        /* Multiplying the 32bit volume factor with the 16bit
+         * sample might result in an 48bit value. We want to
+         * do without 64 bit integers and hence do the
+         * multiplication independently for the HI and LO part
+         * of the volume. */
+        hi = volumes[channel] >> 16;
+        lo = volumes[channel] & 0xFFFF;
+
+        t = (int32_t) samples[i];
+        t = ((int32_t) (t * lo) >> 16) + (t * hi);
+        samples[i] = (int16_t) PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
+        if (PA_UNLIKELY(++channel >= channels))
+            channel = 0;
+    }
+}
+
+/* Apply per-channel software volume to native-endian float samples, in place.
+ * samples:  interleaved samples; length is the size of this buffer in bytes
+ * volumes:  one multiplier per channel
+ * channels: number of entries in volumes; samples cycle through them in order
+ * For 1, 2 or 4 channels whole groups of four samples go through vol_float_neon();
+ * the remainder, and every other channel count, is scaled by the C loop. */
+static void pa_volume_float32ne_neon(float *samples, const float *volumes, unsigned channels, unsigned length) {
+    unsigned channel = 0, i = 0;
+    float32x4_t vol4 = vdupq_n_f32(0.0f);
+    int use_neon = 1;
+
+    length /= sizeof(float);
+
+    /* vld1/vld1q, not a vector-typed pointer dereference: volumes is only known to be float-aligned. */
+    switch (channels) {
+        case 1:
+            vol4 = vdupq_n_f32(volumes[0]);
+            break;
+        case 2: {
+            float32x2_t vol2 = vld1_f32(volumes);
+            vol4 = vcombine_f32(vol2, vol2);
+            break;
+        }
+        case 4:
+            vol4 = vld1q_f32(volumes);
+            break;
+        default:
+            use_neon = 0;
+            break;
+    }
+
+    /* The helper needs at least one whole group of four. The C loop resumes after the last complete
+     * group, an index that is a multiple of 4 and thus of channels, so it always starts on channel 0. */
+    if (use_neon && length >= 4) {
+        vol_float_neon(&vol4, samples, length);
+        i = length & ~3U;
+    }
+
+    for (; i < length; i++) {
+        samples[i] *= volumes[channel];
+        if (PA_UNLIKELY(++channel >= channels))
+            channel = 0;
+    }
+}
+
+#ifdef RUN_TEST
+#define SAMPLES 1019 /* samples per test buffer; 1019 = 4 * 254 + 3, so the C remainder loops run as well */
+#define TIMES 1000 /* repetitions of each timed loop */
+#define CHANNELS 1 /* channel count handed to the volume functions */
+#define PADDING 16 /* extra volume entries run_test_s16() appends after the first CHANNELS */
+
+/* Self-test and benchmark for pa_volume_float32ne_neon(): compare one run against the function currently
+ * returned by pa_get_volume_func(), log the first mismatch, then time TIMES runs of each implementation. */
+static void run_test_float(void) {
+    float floats[SAMPLES];
+    float floats_ref[SAMPLES];
+    float floats_orig[SAMPLES];
+    float volumes[CHANNELS];
+    unsigned i;
+    pa_usec_t start, stop;
+    pa_do_volume_func_t func;
+
+    pa_log_debug("checking NEON volume_float32ne");
+    func = pa_get_volume_func(PA_SAMPLE_FLOAT32NE);
+
+    /* random input in [-0.5, 0.5], kept in floats_orig so every run starts from the same data */
+    for (i = 0; i < SAMPLES; i++)
+        floats_orig[i] = rand()/(float) RAND_MAX - 0.5f;
+    memcpy(floats_ref, floats_orig, sizeof(floats_orig));
+    memcpy(floats, floats_orig, sizeof(floats_orig));
+
+    for (i = 0; i < CHANNELS; i++)
+        volumes[i] = 0.5f * rand() / (float) RAND_MAX;
+
+    pa_volume_float32ne_neon(floats, volumes, CHANNELS, sizeof(floats));
+    func(floats_ref, volumes, CHANNELS, sizeof(floats_ref));
+
+    for (i = 0; i < SAMPLES; i++) {
+        if (fabsf(floats[i] - floats_ref[i]) > 0.00001) {
+            pa_log_debug("%u: %.3f != %.3f (%.3f)", i, floats[i], floats_ref[i], floats_orig[i]);
+            break;
+        }
+    }
+
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        memcpy(floats, floats_orig, sizeof(floats_orig));
+        pa_volume_float32ne_neon(floats, volumes, CHANNELS, sizeof(floats));
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));
+
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        memcpy(floats_ref, floats_orig, sizeof(floats_orig));
+        func(floats_ref, volumes, CHANNELS, sizeof(floats_ref));
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
+}
+
+/* Self-test and benchmark for pa_volume_s16ne_neon(): compare one run against the function currently
+ * returned by pa_get_volume_func(), log the first mismatch, then time TIMES runs of each implementation. */
+static void run_test_s16(void) {
+    int16_t samples[SAMPLES];
+    int16_t samples_ref[SAMPLES];
+    int16_t samples_orig[SAMPLES];
+    uint32_t volumes[CHANNELS + PADDING]; /* unsigned, matching the const uint32_t * parameter it is passed to */
+    unsigned i, padding;
+    pa_usec_t start, stop;
+    pa_do_volume_func_t func;
+
+    pa_log_debug("checking NEON volume_s16ne");
+    func = pa_get_volume_func(PA_SAMPLE_S16NE);
+
+    for (i = 0; i < SAMPLES; i++)
+        samples_orig[i] = rand() - RAND_MAX/2;
+    memcpy(samples_ref, samples_orig, sizeof(samples_orig));
+    memcpy(samples, samples_orig, sizeof(samples_orig));
+
+    for (i = 0; i < CHANNELS; i++)
+        volumes[i] = PA_CLAMP_VOLUME(rand() >> 15);
+    /* repeat the channel volumes into the PADDING entries that follow them */
+    for (padding = 0; padding < PADDING; padding++, i++)
+        volumes[i] = volumes[padding];
+
+    pa_volume_s16ne_neon(samples, volumes, CHANNELS, sizeof(samples));
+    func(samples_ref, volumes, CHANNELS, sizeof(samples_ref));
+
+    for (i = 0; i < SAMPLES; i++) {
+        if (samples[i] != samples_ref[i]) {
+            pa_log_debug("%u: %d != %d (%d)", i, samples[i], samples_ref[i], samples_orig[i]);
+            break;
+        }
+    }
+
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        memcpy(samples, samples_orig, sizeof(samples_orig));
+        pa_volume_s16ne_neon(samples, volumes, CHANNELS, sizeof(samples));
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("NEON: %llu usec.", (long long unsigned int)(stop - start));
+
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        memcpy(samples_ref, samples_orig, sizeof(samples_orig));
+        func(samples_ref, volumes, CHANNELS, sizeof(samples_ref));
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
+}
+#endif /* RUN_TEST */
+
+#endif /* defined(__ARM_NEON__) */
+
+/* Run the built-in self-tests (when RUN_TEST is defined) and install the NEON volume functions for S16NE and
+ * FLOAT32NE. Without __ARM_NEON__ this compiles to an empty function. flags is not examined here. */
+void pa_volume_func_init_neon(pa_cpu_arm_flag_t flags) {
+#ifdef __ARM_NEON__
+# ifdef RUN_TEST
+    /* kept ahead of pa_set_volume_func(): the tests take their reference from pa_get_volume_func() */
+    run_test_float();
+    run_test_s16();
+# endif
+    pa_log_info("Initialising ARM NEON optimized volume functions.");
+    pa_set_volume_func(PA_SAMPLE_S16NE, (pa_do_volume_func_t) pa_volume_s16ne_neon);
+    pa_set_volume_func(PA_SAMPLE_FLOAT32NE, (pa_do_volume_func_t) pa_volume_float32ne_neon);
+#endif /* __ARM_NEON__ */
+}
-- 
1.7.5.4