[gst-devel] [PATCH] audioconvert: add NEON acceleration for some conversions

Mon Aug 10 16:41:20 CEST 2009

1) convert default processing functions to __attribute__((weak)) so they can be overrided with architecture specific accelerated functions (ie. NEON, MMX, Altivec, etc)
2) override gst_audio_quantize_quantize_signed_tpdf_none() to use NEON vector instructions
3) override gst_audio_convert_unpack_float_le() to use NEON vector instructions

This speeds up audioconvert ~10x, at least for the 32b float -> 16b int conversion needed to play AC-3 audio (ie. DVD's) via ALSA.
---
 gst/audioconvert/Makefile.am        |    1 +
 gst/audioconvert/armv7.c            |  209 +++++++++++++++++++++++++++++++++++
 gst/audioconvert/audioconvert.c     |   20 ++--
 gst/audioconvert/gstaudioquantize.c |    4 +-
 gst/audioconvert/gstchannelmix.c    |    4 +-
 5 files changed, 224 insertions(+), 14 deletions(-)
 create mode 100644 gst/audioconvert/armv7.c

diff --git a/gst/audioconvert/Makefile.am b/gst/audioconvert/Makefile.am
index 94978bb..2d273db 100644
--- a/gst/audioconvert/Makefile.am
+++ b/gst/audioconvert/Makefile.am
@@ -5,6 +5,7 @@ libgstaudioconvert_la_SOURCES = \
 	audioconvert.c \
 	gstchannelmix.c \
 	gstaudioquantize.c \
+	armv7.c \
 	plugin.c
 
 libgstaudioconvert_la_CFLAGS = $(GST_PLUGINS_BASE_CFLAGS) $(GST_BASE_CFLAGS) $(GST_CFLAGS)
diff --git a/gst/audioconvert/armv7.c b/gst/audioconvert/armv7.c
new file mode 100644
index 0000000..e39d29d
--- /dev/null
+++ b/gst/audioconvert/armv7.c
@@ -0,0 +1,209 @@
+/* GStreamer
+ *
+ * Copyright (C) 2009 Texas Instruments, Inc - http://www.ti.com/
+ *
+ * Description: NEON/VFP accelerated functions for armv7 architecture
+ *  Created on: Aug 8, 2009
+ *      Author: Rob Clark <rob at ti.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifdef __ARM_NEON__
+#include <arm_neon.h>
+#include <string.h>
+
+#include "audioconvert.h"
+
+
+void
+gst_audio_quantize_quantize_signed_tpdf_none (AudioConvertCtx *ctx,
+    gint32 *src, gint32 *dst, gint count)
+{
+  static guint32 state[4] = {
+      0xdeadbeef,
+      0x305b8cc9,
+      0x6c46ec93,
+      0xad13b0cd
+  };
+
+  gint scale = ctx->out_scale;
+  count *= ctx->out.channels;
+
+  if (scale > 0) {
+    guint32 mask = 0xffffffff & (0xffffffff << scale);
+    guint32 bias = (1U << (scale - 1)) >> 1;
+    gint32 dither = (1<<(scale - 1));
+
+    int32x4_t  vrand;
+    uint32x4_t vstate;
+    uint32x4_t v12345;
+    int32x4_t  vtmp;
+    uint32x4_t vmask;
+
+    vstate = vld1q_u32 (state);
+    v12345 = vmovq_n_u32 (12345);
+    vmask  = vmovq_n_u32 (mask);
+
+    /* until we have less 4 words less to process, use vector instructions
+     * to do everything 4x at a time:
+     */
+    for (;;count-=4) {
+      int64x2_t  vtmp_lo;
+      int64x2_t  vtmp_hi;
+      uint32x4_t vstate2;
+      int32x2_t  vrand_lo;
+      int32x2_t  vrand_hi;
+
+      /* generate next eight random words: (see gst_fast_random_uint32())
+       *
+       *    state = state * 1103515245 + 12345
+       */
+      vstate2 = vmulq_n_u32 (vstate, 1103515245);
+      vstate2 = vaddq_u32 (vstate2, v12345);
+      vstate  = vmulq_n_u32 (vstate2, 1103515245);
+      vstate  = vaddq_u32 (vstate2, v12345);
+
+      /* generate next four scaled random values:
+       *
+       *    gint32 start = bias - dither;
+       *    gint32 end = bias + dither - 1;
+       *    gint64 tmp1 = gst_fast_random_uint32 ();
+       *    gint64 tmp2 = gst_fast_random_uint32 ();
+       *    rand = (gint32)(((tmp1+tmp2) * (end - start)) / (1LLU<<32) + start);
+       *
+       * need to split vstate and vstate2 into 2*2 int64x2_t and add....
+       */
+      vstate2 = vaddq_u32 (vstate, vstate2);     /* tmp1+tmp2 */
+      vtmp_lo = vreinterpretq_s64_u64 (          /* * (end-start) */
+          vmull_n_u32 (vget_low_u32 (vstate2), (2*dither) - 1));
+      vtmp_hi = vreinterpretq_s64_u64 (          /* * (end-start) */
+          vmull_n_u32 (vget_high_u32 (vstate2), (2*dither) - 1));
+
+      vtmp_lo = vshrq_n_s64 (vtmp_lo, 32);       /* / (1LLU<<32) */
+      vtmp_hi = vshrq_n_s64 (vtmp_hi, 32);       /* / (1LLU<<32) */
+
+
+      /* now want to put vtmp_hi and vtmp_lo back together..
+       * then add 'start' (bias-dither).. which is negative..
+       */
+      vrand_lo = vmovn_s64 (vtmp_lo);
+      vrand_hi = vmovn_s64 (vtmp_hi);
+      vrand    = vcombine_s32 (vrand_lo, vrand_hi);
+      vrand    = vaddq_s32 (vrand, vmovq_n_s32 (bias-dither));
+
+      /* load next 4 words:
+       */
+      vtmp = vld1q_s32 (src);
+      src += 4;
+
+      /* perform saturating add of random noise... we don't want the
+       * value to wrap around:
+       *
+       * XXX I *think* vqaddq will handle saturation for underflow too..
+       */
+      vtmp = vqaddq_s32 (vtmp, vrand);
+      vtmp = vreinterpretq_s32_u32 (
+          vandq_u32 (vreinterpretq_u32_s32 (vtmp), vmask));
+
+      /* we check for less than four remaining words at the end, before
+       * we store the result back.. the assumption is that it shouldn't
+       * cause a segfault to read past the end of 'src', and there is no
+       * harm in processing a few garbage words.  But we definitely don't
+       * want to write past the end of 'dst'
+       */
+      if (count<4) break;
+
+      /* store 4 words to result:
+       */
+      vst1q_s32 (dst, vtmp);
+      dst += 4;
+    }
+
+    vst1q_u32 (state, vstate);
+
+    /* at this point, we could have 0-3 result bytes in vtmp to write
+     * back out to 'dst':
+     */
+    if (count) {
+      gint32 tmpdst[4];
+      gint32 *tmpp = tmpdst;
+
+      vst1q_s32 (tmpdst, vtmp);
+
+      while (count--) {
+        *dst++ = *tmpp++;
+      }
+    }
+
+  } else {
+    memmove (dst, src, count);
+  }
+}
+
+void
+gst_audio_convert_unpack_float_le (gfloat * src, gint32 * dst, gint s, gint count)
+{
+  float32x4_t vsrc;
+  float32x4_t v05;
+  int32x4_t   vdst;
+
+  v05 = vmovq_n_f32 (0.5);
+
+  for (;;count-=4) {
+
+    /* load next 4 words:
+     */
+    vsrc = vld1q_f32 ((float32_t *)src);
+    src += 4;
+
+    /* convert to int:
+     */
+    vsrc = vmulq_n_f32 (vsrc, 2147483647.0);
+    vsrc = vaddq_f32 (vsrc, v05);
+    vdst = vcvtq_s32_f32 (vsrc);
+
+    /* we check for less than four remaining words at the end, before
+     * we store the result back.. the assumption is that it shouldn't
+     * cause a segfault to read past the end of 'src', and there is no
+     * harm in processing a few garbage words.  But we definitely don't
+     * want to write past the end of 'dst'
+     */
+    if (count<4) break;
+
+    /* store 4 words to result:
+     */
+    vst1q_s32 (dst, vdst);
+    dst += 4;
+  }
+
+  /* at this point, we could have 0-3 result bytes in vtmp to write
+   * back out to 'dst':
+   */
+  if (count) {
+    gint32 tmpdst[4];
+    gint32 *tmpp = tmpdst;
+
+    vst1q_s32 (tmpdst, vdst);
+
+    while (count--) {
+      *dst++ = *tmpp++;
+    }
+  }
+}
+
+
+#endif
diff --git a/gst/audioconvert/audioconvert.c b/gst/audioconvert/audioconvert.c
index 4780324..c18d217 100644
--- a/gst/audioconvert/audioconvert.c
+++ b/gst/audioconvert/audioconvert.c
@@ -38,11 +38,11 @@
  * unpack code
  */
 #define MAKE_UNPACK_FUNC_NAME(name)                                     \
-audio_convert_unpack_##name
+gst_audio_convert_unpack_##name
 
 /* unpack from integer to signed integer 32 */
 #define MAKE_UNPACK_FUNC_II(name, stride, sign, READ_FUNC)              \
-static void                                                             \
+void __attribute__((weak))                                              \
 MAKE_UNPACK_FUNC_NAME (name) (guint8 *src, gint32 *dst,                 \
         gint scale, gint count)                                         \
 {                                                                       \
@@ -54,7 +54,7 @@ MAKE_UNPACK_FUNC_NAME (name) (guint8 *src, gint32 *dst,                 \
 
 /* unpack from float to signed integer 32 */
 #define MAKE_UNPACK_FUNC_FI(name, type, READ_FUNC)                            \
-static void                                                                   \
+void __attribute__((weak))                                                    \
 MAKE_UNPACK_FUNC_NAME (name) (type * src, gint32 * dst, gint s, gint count)   \
 {                                                                             \
   gdouble temp;                                                               \
@@ -68,7 +68,7 @@ MAKE_UNPACK_FUNC_NAME (name) (type * src, gint32 * dst, gint s, gint count)   \
 
 /* unpack from float to float 64 (double) */
 #define MAKE_UNPACK_FUNC_FF(name, type, FUNC)                                 \
-static void                                                                   \
+void __attribute__((weak))                                                    \
 MAKE_UNPACK_FUNC_NAME (name) (type * src, gdouble * dst, gint s,              \
     gint count)                                                               \
 {                                                                             \
@@ -78,7 +78,7 @@ MAKE_UNPACK_FUNC_NAME (name) (type * src, gdouble * dst, gint s,              \
 
 /* unpack from int to float 64 (double) */
 #define MAKE_UNPACK_FUNC_IF(name, stride, sign, READ_FUNC)                    \
-static void                                                                   \
+void __attribute__((weak))                                                    \
 MAKE_UNPACK_FUNC_NAME (name) (guint8 * src, gdouble * dst, gint scale,        \
     gint count)                                                               \
 {                                                                             \
@@ -158,7 +158,7 @@ audio_convert_pack_##name
 
 /* pack from signed integer 32 to integer */
 #define MAKE_PACK_FUNC_II(name, stride, sign, WRITE_FUNC)               \
-static void                                                             \
+void __attribute__((weak))                                              \
 MAKE_PACK_FUNC_NAME (name) (gint32 *src, guint8 * dst,                  \
         gint scale, gint count)                                         \
 {                                                                       \
@@ -172,7 +172,7 @@ MAKE_PACK_FUNC_NAME (name) (gint32 *src, guint8 * dst,                  \
 
 /* pack from signed integer 32 to float */
 #define MAKE_PACK_FUNC_IF(name, type, FUNC)                             \
-static void                                                             \
+void __attribute__((weak))                                              \
 MAKE_PACK_FUNC_NAME (name) (gint32 * src, type * dst, gint scale,       \
     gint count)                                                         \
 {                                                                       \
@@ -182,7 +182,7 @@ MAKE_PACK_FUNC_NAME (name) (gint32 * src, type * dst, gint scale,       \
 
 /* pack from float 64 (double) to float */
 #define MAKE_PACK_FUNC_FF(name, type, FUNC)                             \
-static void                                                             \
+void __attribute__((weak))                                              \
 MAKE_PACK_FUNC_NAME (name) (gdouble * src, type * dst, gint s,          \
     gint count)                                                         \
 {                                                                       \
@@ -194,7 +194,7 @@ MAKE_PACK_FUNC_NAME (name) (gdouble * src, type * dst, gint s,          \
  * the floats are already in the correct range. Only a cast is needed.
  */
 #define MAKE_PACK_FUNC_FI_S(name, stride, WRITE_FUNC)                   \
-static void                                                             \
+void __attribute__((weak))                                              \
 MAKE_PACK_FUNC_NAME (name) (gdouble * src, guint8 * dst, gint scale,    \
     gint count)                                                         \
 {                                                                       \
@@ -212,7 +212,7 @@ MAKE_PACK_FUNC_NAME (name) (gdouble * src, guint8 * dst, gint scale,    \
  * and an addition of 2^(target_depth-1) to get in the correct unsigned
  * range. */
 #define MAKE_PACK_FUNC_FI_U(name, stride, WRITE_FUNC)                   \
-static void                                                             \
+void __attribute__((weak))                                              \
 MAKE_PACK_FUNC_NAME (name) (gdouble * src, guint8 * dst, gint scale,    \
     gint count)                                                         \
 {                                                                       \
diff --git a/gst/audioconvert/gstaudioquantize.c b/gst/audioconvert/gstaudioquantize.c
index 2155397..be959c4 100644
--- a/gst/audioconvert/gstaudioquantize.c
+++ b/gst/audioconvert/gstaudioquantize.c
@@ -46,7 +46,7 @@ gst_audio_quantize_quantize_##name
 
 #define MAKE_QUANTIZE_FUNC_I(name, DITHER_INIT_FUNC, ADD_DITHER_FUNC,   \
                              ROUND_FUNC)                                \
-static void                                                             \
+void __attribute__((weak))                                              \
 MAKE_QUANTIZE_FUNC_NAME (name) (AudioConvertCtx *ctx, gint32 *src,      \
                                 gint32 *dst, gint count)                \
 {                                                                       \
@@ -86,7 +86,7 @@ MAKE_QUANTIZE_FUNC_NAME (name) (AudioConvertCtx *ctx, gint32 *src,      \
 #define MAKE_QUANTIZE_FUNC_F(name, DITHER_INIT_FUNC, NS_INIT_FUNC,      \
                              ADD_NS_FUNC, ADD_DITHER_FUNC,              \
                              UPDATE_ERROR_FUNC)                         \
-static void                                                             \
+void __attribute__((weak))                                              \
 MAKE_QUANTIZE_FUNC_NAME (name) (AudioConvertCtx *ctx, gdouble *src,     \
                                 gdouble *dst, gint count)               \
 {                                                                       \
diff --git a/gst/audioconvert/gstchannelmix.c b/gst/audioconvert/gstchannelmix.c
index 0f9b945..aac8957 100644
--- a/gst/audioconvert/gstchannelmix.c
+++ b/gst/audioconvert/gstchannelmix.c
@@ -659,7 +659,7 @@ gst_channel_mix_passthrough (AudioConvertCtx * this)
 
 /* IMPORTANT: out_data == in_data is possible, make sure to not overwrite data
  * you might need later on! */
-void
+void __attribute__((weak))
 gst_channel_mix_mix_int (AudioConvertCtx * this,
     gint32 * in_data, gint32 * out_data, gint samples)
 {
@@ -698,7 +698,7 @@ gst_channel_mix_mix_int (AudioConvertCtx * this,
   }
 }
 
-void
+void __attribute__((weak))
 gst_channel_mix_mix_float (AudioConvertCtx * this,
     gdouble * in_data, gdouble * out_data, gint samples)
 {
-- 
1.6.3.2