[gst-embedded] [PATCH] audioconvert: add NEON acceleration for some conversions
Rob Clark
rob at ti.com
Mon Aug 10 07:41:20 PDT 2009
1) convert default processing functions to __attribute__((weak)) so they can be overridden with architecture-specific accelerated functions (e.g. NEON, MMX, Altivec, etc.)
2) override gst_audio_quantize_quantize_signed_tpdf_none() to use NEON vector instructions
3) override gst_audio_convert_unpack_float_le() to use NEON vector instructions
This speeds up audioconvert ~10x, at least for the 32b float -> 16b int conversion needed to play AC-3 audio (i.e. DVDs) via ALSA.
---
gst/audioconvert/Makefile.am | 1 +
gst/audioconvert/armv7.c | 209 +++++++++++++++++++++++++++++++++++
gst/audioconvert/audioconvert.c | 20 ++--
gst/audioconvert/gstaudioquantize.c | 4 +-
gst/audioconvert/gstchannelmix.c | 4 +-
5 files changed, 224 insertions(+), 14 deletions(-)
create mode 100644 gst/audioconvert/armv7.c
diff --git a/gst/audioconvert/Makefile.am b/gst/audioconvert/Makefile.am
index 94978bb..2d273db 100644
--- a/gst/audioconvert/Makefile.am
+++ b/gst/audioconvert/Makefile.am
@@ -5,6 +5,7 @@ libgstaudioconvert_la_SOURCES = \
audioconvert.c \
gstchannelmix.c \
gstaudioquantize.c \
+ armv7.c \
plugin.c
libgstaudioconvert_la_CFLAGS = $(GST_PLUGINS_BASE_CFLAGS) $(GST_BASE_CFLAGS) $(GST_CFLAGS)
diff --git a/gst/audioconvert/armv7.c b/gst/audioconvert/armv7.c
new file mode 100644
index 0000000..e39d29d
--- /dev/null
+++ b/gst/audioconvert/armv7.c
@@ -0,0 +1,209 @@
+/* GStreamer
+ *
+ * Copyright (C) 2009 Texas Instruments, Inc - http://www.ti.com/
+ *
+ * Description: NEON/VFP accelerated functions for armv7 architecture
+ * Created on: Aug 8, 2009
+ * Author: Rob Clark <rob at ti.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifdef __ARM_NEON__
+#include <arm_neon.h>
+#include <string.h>
+
+#include "audioconvert.h"
+
+
+/* NEON replacement for the weak C version of this quantizer.  Adds scaled
+ * pseudo-random dither to count * ctx->out.channels words of src and clears
+ * the low ctx->out_scale bits, writing the result to dst.  The generator
+ * state lives in a function-local static, so it is shared by every caller.
+ */
+void
+gst_audio_quantize_quantize_signed_tpdf_none (AudioConvertCtx *ctx,
+    gint32 *src, gint32 *dst, gint count)
+{
+  static guint32 state[4] = {
+    0xdeadbeef,
+    0x305b8cc9,
+    0x6c46ec93,
+    0xad13b0cd
+  };
+
+  gint scale = ctx->out_scale;
+  count *= ctx->out.channels;
+
+  if (scale > 0) {
+    guint32 mask = 0xffffffff & (0xffffffff << scale);
+    guint32 bias = (1U << (scale - 1)) >> 1;
+    gint32 dither = (1<<(scale - 1));
+    gint32 tail[4] = { 0, 0, 0, 0 };
+
+    int32x4_t vrand;
+    uint32x4_t vstate;
+    uint32x4_t v12345;
+    int32x4_t vtmp;
+    uint32x4_t vmask;
+
+    vstate = vld1q_u32 (state);
+    v12345 = vmovq_n_u32 (12345);
+    vmask = vmovq_n_u32 (mask);
+
+    /* process four words per iteration; a final group of 1-3 words is
+     * staged through 'tail' so that we never read past the end of 'src'
+     * or write past the end of 'dst':
+     */
+    for (; count > 0; count -= 4) {
+      int64x2_t vtmp_lo;
+      int64x2_t vtmp_hi;
+      uint32x4_t vstate2;
+      int32x2_t vrand_lo;
+      int32x2_t vrand_hi;
+      gint32 *in = src;
+      gint32 *out = dst;
+
+      if (count < 4) {
+        memcpy (tail, src, count * sizeof (gint32));
+        in = out = tail;
+      }
+
+      /* generate next eight random words: (see gst_fast_random_uint32())
+       *
+       *   state = state * 1103515245 + 12345
+       */
+      vstate2 = vmulq_n_u32 (vstate, 1103515245);
+      vstate2 = vaddq_u32 (vstate2, v12345);
+      vstate = vmulq_n_u32 (vstate2, 1103515245);
+      vstate = vaddq_u32 (vstate, v12345);
+
+      /* generate next four scaled random values:
+       *
+       *   gint32 start = bias - dither;
+       *   gint32 end = bias + dither - 1;
+       *   gint64 tmp1 = gst_fast_random_uint32 ();
+       *   gint64 tmp2 = gst_fast_random_uint32 ();
+       *   rand = (gint32)(((tmp1+tmp2) * (end - start)) / (1LLU<<32) + start);
+       *
+       * NOTE(review): the add below is a 32-bit lane add and wraps, unlike
+       * the 64-bit tmp1+tmp2 above; confirm that is acceptable.
+       */
+      vstate2 = vaddq_u32 (vstate, vstate2);      /* tmp1+tmp2 */
+      vtmp_lo = vreinterpretq_s64_u64 (           /* * (end-start) */
+          vmull_n_u32 (vget_low_u32 (vstate2), (2*dither) - 1));
+      vtmp_hi = vreinterpretq_s64_u64 (           /* * (end-start) */
+          vmull_n_u32 (vget_high_u32 (vstate2), (2*dither) - 1));
+
+      vtmp_lo = vshrq_n_s64 (vtmp_lo, 32);        /* / (1LLU<<32) */
+      vtmp_hi = vshrq_n_s64 (vtmp_hi, 32);        /* / (1LLU<<32) */
+
+      /* narrow the two halves back to 32 bits, recombine them, then add
+       * 'start' (bias-dither).. which is negative..
+       */
+      vrand_lo = vmovn_s64 (vtmp_lo);
+      vrand_hi = vmovn_s64 (vtmp_hi);
+      vrand = vcombine_s32 (vrand_lo, vrand_hi);
+      vrand = vaddq_s32 (vrand, vmovq_n_s32 (bias-dither));
+
+      /* load next 4 words:
+       */
+      vtmp = vld1q_s32 (in);
+
+      /* perform saturating add of random noise... we don't want the
+       * value to wrap around (vqaddq saturates in both directions), then
+       * clear the bits below the output precision:
+       */
+      vtmp = vqaddq_s32 (vtmp, vrand);
+      vtmp = vreinterpretq_s32_u32 (
+          vandq_u32 (vreinterpretq_u32_s32 (vtmp), vmask));
+
+      /* store 4 words to result (or to 'tail' for the final group):
+       */
+      vst1q_s32 (out, vtmp);
+
+      if (count < 4) {
+        /* copy only the valid words of the final group out to 'dst' */
+        memcpy (dst, tail, count * sizeof (gint32));
+        break;
+      }
+
+      src += 4;
+      dst += 4;
+    }
+
+    /* save generator state for the next call */
+    vst1q_u32 (state, vstate);
+
+  } else {
+    /* nothing to quantize: copy the words through; memmove takes bytes */
+    memmove (dst, src, count * sizeof (gint32));
+  }
+}
+
+/* NEON replacement for the weak C version: scales 'count' floats from
+ * src by 2147483647.0, adds 0.5 and converts them to gint32 in dst.
+ * The 's' argument is not used.
+ */
+void
+gst_audio_convert_unpack_float_le (gfloat * src, gint32 * dst, gint s, gint count)
+{
+  float32x4_t vsrc;
+  float32x4_t v05;
+  int32x4_t vdst;
+  gfloat tail_in[4] = { 0, 0, 0, 0 };
+  gint32 tail_out[4];
+
+  v05 = vmovq_n_f32 (0.5);
+
+  /* a final group of 1-3 words is staged through the tail buffers so that
+   * we never read past the end of 'src' or write past the end of 'dst':
+   */
+  for (; count > 0; count -= 4) {
+    gfloat *in = src;
+    gint32 *out = dst;
+
+    if (count < 4) {
+      memcpy (tail_in, src, count * sizeof (gfloat));
+      in = tail_in;
+      out = tail_out;
+    }
+
+    /* load next 4 words: */
+    vsrc = vld1q_f32 ((float32_t *)in);
+
+    /* convert to int.  NOTE(review): vcvtq truncates toward zero, so the
+     * +0.5 rounds to nearest only for non-negative values -- confirm: */
+    vsrc = vmulq_n_f32 (vsrc, 2147483647.0);
+    vsrc = vaddq_f32 (vsrc, v05);
+    vdst = vcvtq_s32_f32 (vsrc);
+
+    /* store 4 words to result (or to 'tail_out' for the final group): */
+    vst1q_s32 (out, vdst);
+
+    if (count < 4) {
+      /* copy only the valid words of the final group out to 'dst' */
+      memcpy (dst, tail_out, count * sizeof (gint32));
+      break;
+    }
+
+    src += 4;
+    dst += 4;
+  }
+}
+
+
+#endif
diff --git a/gst/audioconvert/audioconvert.c b/gst/audioconvert/audioconvert.c
index 4780324..c18d217 100644
--- a/gst/audioconvert/audioconvert.c
+++ b/gst/audioconvert/audioconvert.c
@@ -38,11 +38,11 @@
* unpack code
*/
#define MAKE_UNPACK_FUNC_NAME(name) \
-audio_convert_unpack_##name
+gst_audio_convert_unpack_##name
/* unpack from integer to signed integer 32 */
#define MAKE_UNPACK_FUNC_II(name, stride, sign, READ_FUNC) \
-static void \
+void __attribute__((weak)) \
MAKE_UNPACK_FUNC_NAME (name) (guint8 *src, gint32 *dst, \
gint scale, gint count) \
{ \
@@ -54,7 +54,7 @@ MAKE_UNPACK_FUNC_NAME (name) (guint8 *src, gint32 *dst, \
/* unpack from float to signed integer 32 */
#define MAKE_UNPACK_FUNC_FI(name, type, READ_FUNC) \
-static void \
+void __attribute__((weak)) \
MAKE_UNPACK_FUNC_NAME (name) (type * src, gint32 * dst, gint s, gint count) \
{ \
gdouble temp; \
@@ -68,7 +68,7 @@ MAKE_UNPACK_FUNC_NAME (name) (type * src, gint32 * dst, gint s, gint count) \
/* unpack from float to float 64 (double) */
#define MAKE_UNPACK_FUNC_FF(name, type, FUNC) \
-static void \
+void __attribute__((weak)) \
MAKE_UNPACK_FUNC_NAME (name) (type * src, gdouble * dst, gint s, \
gint count) \
{ \
@@ -78,7 +78,7 @@ MAKE_UNPACK_FUNC_NAME (name) (type * src, gdouble * dst, gint s, \
/* unpack from int to float 64 (double) */
#define MAKE_UNPACK_FUNC_IF(name, stride, sign, READ_FUNC) \
-static void \
+void __attribute__((weak)) \
MAKE_UNPACK_FUNC_NAME (name) (guint8 * src, gdouble * dst, gint scale, \
gint count) \
{ \
@@ -158,7 +158,7 @@ audio_convert_pack_##name
/* pack from signed integer 32 to integer */
#define MAKE_PACK_FUNC_II(name, stride, sign, WRITE_FUNC) \
-static void \
+void __attribute__((weak)) \
MAKE_PACK_FUNC_NAME (name) (gint32 *src, guint8 * dst, \
gint scale, gint count) \
{ \
@@ -172,7 +172,7 @@ MAKE_PACK_FUNC_NAME (name) (gint32 *src, guint8 * dst, \
/* pack from signed integer 32 to float */
#define MAKE_PACK_FUNC_IF(name, type, FUNC) \
-static void \
+void __attribute__((weak)) \
MAKE_PACK_FUNC_NAME (name) (gint32 * src, type * dst, gint scale, \
gint count) \
{ \
@@ -182,7 +182,7 @@ MAKE_PACK_FUNC_NAME (name) (gint32 * src, type * dst, gint scale, \
/* pack from float 64 (double) to float */
#define MAKE_PACK_FUNC_FF(name, type, FUNC) \
-static void \
+void __attribute__((weak)) \
MAKE_PACK_FUNC_NAME (name) (gdouble * src, type * dst, gint s, \
gint count) \
{ \
@@ -194,7 +194,7 @@ MAKE_PACK_FUNC_NAME (name) (gdouble * src, type * dst, gint s, \
* the floats are already in the correct range. Only a cast is needed.
*/
#define MAKE_PACK_FUNC_FI_S(name, stride, WRITE_FUNC) \
-static void \
+void __attribute__((weak)) \
MAKE_PACK_FUNC_NAME (name) (gdouble * src, guint8 * dst, gint scale, \
gint count) \
{ \
@@ -212,7 +212,7 @@ MAKE_PACK_FUNC_NAME (name) (gdouble * src, guint8 * dst, gint scale, \
* and an addition of 2^(target_depth-1) to get in the correct unsigned
* range. */
#define MAKE_PACK_FUNC_FI_U(name, stride, WRITE_FUNC) \
-static void \
+void __attribute__((weak)) \
MAKE_PACK_FUNC_NAME (name) (gdouble * src, guint8 * dst, gint scale, \
gint count) \
{ \
diff --git a/gst/audioconvert/gstaudioquantize.c b/gst/audioconvert/gstaudioquantize.c
index 2155397..be959c4 100644
--- a/gst/audioconvert/gstaudioquantize.c
+++ b/gst/audioconvert/gstaudioquantize.c
@@ -46,7 +46,7 @@ gst_audio_quantize_quantize_##name
#define MAKE_QUANTIZE_FUNC_I(name, DITHER_INIT_FUNC, ADD_DITHER_FUNC, \
ROUND_FUNC) \
-static void \
+void __attribute__((weak)) \
MAKE_QUANTIZE_FUNC_NAME (name) (AudioConvertCtx *ctx, gint32 *src, \
gint32 *dst, gint count) \
{ \
@@ -86,7 +86,7 @@ MAKE_QUANTIZE_FUNC_NAME (name) (AudioConvertCtx *ctx, gint32 *src, \
#define MAKE_QUANTIZE_FUNC_F(name, DITHER_INIT_FUNC, NS_INIT_FUNC, \
ADD_NS_FUNC, ADD_DITHER_FUNC, \
UPDATE_ERROR_FUNC) \
-static void \
+void __attribute__((weak)) \
MAKE_QUANTIZE_FUNC_NAME (name) (AudioConvertCtx *ctx, gdouble *src, \
gdouble *dst, gint count) \
{ \
diff --git a/gst/audioconvert/gstchannelmix.c b/gst/audioconvert/gstchannelmix.c
index 0f9b945..aac8957 100644
--- a/gst/audioconvert/gstchannelmix.c
+++ b/gst/audioconvert/gstchannelmix.c
@@ -659,7 +659,7 @@ gst_channel_mix_passthrough (AudioConvertCtx * this)
/* IMPORTANT: out_data == in_data is possible, make sure to not overwrite data
* you might need later on! */
-void
+void __attribute__((weak))
gst_channel_mix_mix_int (AudioConvertCtx * this,
gint32 * in_data, gint32 * out_data, gint samples)
{
@@ -698,7 +698,7 @@ gst_channel_mix_mix_int (AudioConvertCtx * this,
}
}
-void
+void __attribute__((weak))
gst_channel_mix_mix_float (AudioConvertCtx * this,
gdouble * in_data, gdouble * out_data, gint samples)
{
--
1.6.3.2
More information about the Gstreamer-embedded
mailing list