[pulseaudio-commits] 12 commits - src/pulsecore src/tests

Tue Apr 29 06:03:31 PDT 2014

 src/pulsecore/remap.c     |  432 ++++++++++++++++++++++++++++++++++------------
 src/pulsecore/remap.h     |   20 +-
 src/pulsecore/remap_mmx.c |   53 ++---
 src/pulsecore/remap_sse.c |   53 ++---
 src/pulsecore/resampler.c |   38 ++--
 src/tests/cpu-test.c      |   22 --
 6 files changed, 418 insertions(+), 200 deletions(-)

New commits:
commit a388b909f3a0970cf56d4007509dd1b96b4265e2
Author: Peter Meerwald <p.meerwald at bct-electronic.com>
Date:   Wed Apr 16 19:02:02 2014 +0200

    remap: Add stereo to mono and 4-channel special case remapping
    
    The generic matrix remapping is rather inefficient; special-case code
    improves performance by 3x easily.
    
    v4: split into s16 and float code, 4-channel remapping
    v3: fix remap_mono_to_stereo_c(), use assignment
    v2: use consistent array addressing
    
    on Intel Core i7-870 @ 2.93 GHz (GCC 4.6, 64-bit):
    
    Checking special remap (float, mono->stereo)
    func: 70392 usec (avg: 703.92, min = 583, max = 1879, stddev = 295.192).
    orig: 193042 usec (avg: 1930.42, min = 1457, max = 2269, stddev = 89.9045).
    Checking special remap (float, mono->4-channel)
    func: 118408 usec (avg: 1184.08, min = 1151, max = 1454, stddev = 57.1244).
    orig: 380074 usec (avg: 3800.74, min = 3740, max = 4180, stddev = 96.3389).
    Checking special remap (s16, mono->stereo)
    func: 60574 usec (avg: 605.74, min = 582, max = 659, stddev = 20.7681).
    orig: 188262 usec (avg: 1882.62, min = 1804, max = 2167, stddev = 79.17).
    Checking special remap (s16, mono->4-channel)
    func: 120331 usec (avg: 1203.31, min = 1151, max = 1429, stddev = 55.2863).
    orig: 376028 usec (avg: 3760.28, min = 3609, max = 4096, stddev = 122.043).
    Checking special remap (float, stereo->mono)
    func: 61408 usec (avg: 614.08, min = 580, max = 867, stddev = 50.933).
    orig: 186484 usec (avg: 1864.84, min = 1808, max = 2121, stddev = 65.3967).
    Checking special remap (float, 4-channel->mono)
    func: 118101 usec (avg: 1181.01, min = 1157, max = 1383, stddev = 36.4474).
    orig: 365191 usec (avg: 3651.91, min = 3540, max = 4083, stddev = 117.509).
    Checking special remap (s16, stereo->mono)
    func: 82908 usec (avg: 829.08, min = 795, max = 953, stddev = 33.3409).
    orig: 182565 usec (avg: 1825.65, min = 1774, max = 2117, stddev = 65.5401).
    Checking special remap (s16, 4-channel->mono)
    func: 132025 usec (avg: 1320.25, min = 1284, max = 1509, stddev = 47.0133).
    orig: 363347 usec (avg: 3633.47, min = 3560, max = 4012, stddev = 111.259).
    
    on ARM Cortex-A8 (TI OMAP3 DM3730 @ 1GHz) (Linaro GCC 4.6):
    
    Checking special remap (float, mono->stereo)
    func: 1213562 usec (avg: 12135.6, min = 4669, max = 16266, stddev = 2067.64).
    orig: 9251927 usec (avg: 92519.3, min = 87372, max = 134216, stddev = 5965.79).
    Checking special remap (float, mono->4-channel)
    func: 2479550 usec (avg: 24795.5, min = 7507, max = 29358, stddev = 2690.16).
    orig: 13186133 usec (avg: 131861, min = 119843, max = 263855, stddev = 27309).
    Checking special remap (s16, mono->stereo)
    func: 471894 usec (avg: 4718.94, min = 4058, max = 9583, stddev = 1302.7).
    orig: 1673826 usec (avg: 16738.3, min = 14679, max = 31342, stddev = 2271.67).
    Checking special remap (s16, mono->4-channel)
    func: 869508 usec (avg: 8695.08, min = 7019, max = 19165, stddev = 1866.94).
    orig: 3317020 usec (avg: 33170.2, min = 29327, max = 47577, stddev = 2029.11).
    Checking special remap (float, stereo->mono)
    func: 4405182 usec (avg: 44051.8, min = 41443, max = 77912, stddev = 4160.54).
    orig: 13245064 usec (avg: 132451, min = 125244, max = 182282, stddev = 8543.93).
    Checking special remap (float, 4-channel->mono)
    func: 8607974 usec (avg: 86079.7, min = 81909, max = 116608, stddev = 4311.52).
    orig: 26326036 usec (avg: 263260, min = 255097, max = 312928, stddev = 10111.5).
    Checking special remap (s16, stereo->mono)
    func: 1209135 usec (avg: 12091.4, min = 10742, max = 16632, stddev = 1633.88).
    orig: 3081515 usec (avg: 30815.2, min = 27008, max = 50537, stddev = 3124.35).
    Checking special remap (s16, 4-channel->mono)
    func: 1653868 usec (avg: 16538.7, min = 14648, max = 20721, stddev = 1834.52).
    orig: 6017854 usec (avg: 60178.5, min = 56061, max = 89569, stddev = 4052.86).
    
    benchmark code will be posted as follow-up patches
    
    Signed-off-by: Peter Meerwald <pmeerw at pmeerw.net>

diff --git a/src/pulsecore/remap.c b/src/pulsecore/remap.c
index 80194a4..37213d5 100644
--- a/src/pulsecore/remap.c
+++ b/src/pulsecore/remap.c
@@ -70,7 +70,116 @@ static void remap_mono_to_stereo_float32ne_c(pa_remap_t *m, float *dst, const fl
     }
 }
 
+static void remap_stereo_to_mono_s16ne_c(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
+    unsigned i;
+
+    for (i = n >> 2; i > 0; i--) {
+        dst[0] = (src[0] + src[1])/2;
+        dst[1] = (src[2] + src[3])/2;
+        dst[2] = (src[4] + src[5])/2;
+        dst[3] = (src[6] + src[7])/2;
+        src += 8;
+        dst += 4;
+    }
+    for (i = n & 3; i; i--) {
+        dst[0] = (src[0] + src[1])/2;
+        src += 2;
+        dst += 1;
+    }
+}
+
+static void remap_stereo_to_mono_float32ne_c(pa_remap_t *m, float *dst, const float *src, unsigned n) {
+    unsigned i;
+
+    for (i = n >> 2; i > 0; i--) {
+        dst[0] = (src[0] + src[1])*0.5f;
+        dst[1] = (src[2] + src[3])*0.5f;
+        dst[2] = (src[4] + src[5])*0.5f;
+        dst[3] = (src[6] + src[7])*0.5f;
+        src += 8;
+        dst += 4;
+    }
+    for (i = n & 3; i; i--) {
+        dst[0] = (src[0] + src[1])*0.5f;
+        src += 2;
+        dst += 1;
+    }
+}
+
+static void remap_mono_to_ch4_s16ne_c(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
+    unsigned i;
+
+    for (i = n >> 2; i; i--) {
+        dst[0] = dst[1] = dst[2] = dst[3] = src[0];
+        dst[4] = dst[5] = dst[6] = dst[7] = src[1];
+        dst[8] = dst[9] = dst[10] = dst[11] = src[2];
+        dst[12] = dst[13] = dst[14] = dst[15] = src[3];
+        src += 4;
+        dst += 16;
+    }
+    for (i = n & 3; i; i--) {
+        dst[0] = dst[1] = dst[2] = dst[3] = src[0];
+        src++;
+        dst += 4;
+    }
+}
+
+static void remap_mono_to_ch4_float32ne_c(pa_remap_t *m, float *dst, const float *src, unsigned n) {
+    unsigned i;
+
+    for (i = n >> 2; i; i--) {
+        dst[0] = dst[1] = dst[2] = dst[3] = src[0];
+        dst[4] = dst[5] = dst[6] = dst[7] = src[1];
+        dst[8] = dst[9] = dst[10] = dst[11] = src[2];
+        dst[12] = dst[13] = dst[14] = dst[15] = src[3];
+        src += 4;
+        dst += 16;
+    }
+    for (i = n & 3; i; i--) {
+        dst[0] = dst[1] = dst[2] = dst[3] = src[0];
+        src++;
+        dst += 4;
+    }
+}
+
+static void remap_ch4_to_mono_s16ne_c(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
+    unsigned i;
+
+    for (i = n >> 2; i > 0; i--) {
+        dst[0] = (src[0] + src[1] + src[2] + src[3])/4;
+        dst[1] = (src[4] + src[5] + src[6] + src[7])/4;
+        dst[2] = (src[8] + src[9] + src[10] + src[11])/4;
+        dst[3] = (src[12] + src[13] + src[14] + src[15])/4;
+        src += 16;
+        dst += 4;
+    }
+    for (i = n & 3; i; i--) {
+        dst[0] = (src[0] + src[1] + src[2] + src[3])/4;
+        src += 4;
+        dst += 1;
+    }
+}
+
+static void remap_ch4_to_mono_float32ne_c(pa_remap_t *m, float *dst, const float *src, unsigned n) {
+    unsigned i;
+
+    for (i = n >> 2; i > 0; i--) {
+        dst[0] = (src[0] + src[1] + src[2] + src[3])*0.25f;
+        dst[1] = (src[4] + src[5] + src[6] + src[7])*0.25f;
+        dst[2] = (src[8] + src[9] + src[10] + src[11])*0.25f;
+        dst[3] = (src[12] + src[13] + src[14] + src[15])*0.25f;
+        src += 16;
+        dst += 4;
+    }
+    for (i = n & 3; i; i--) {
+        dst[0] = (src[0] + src[1] + src[2] + src[3])*0.25f;
+        src += 4;
+        dst += 1;
+    }
+}
+
 static void remap_channels_matrix_s16ne_c(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
+
     unsigned oc, ic, i;
     unsigned n_ic, n_oc;
 
@@ -241,6 +350,26 @@ static void init_remap_c(pa_remap_t *m) {
         pa_log_info("Using mono to stereo remapping");
         pa_set_remap_func(m, (pa_do_remap_func_t) remap_mono_to_stereo_s16ne_c,
             (pa_do_remap_func_t) remap_mono_to_stereo_float32ne_c);
+    } else if (n_ic == 2 && n_oc == 1 &&
+            m->map_table_i[0][0] == 0x8000 && m->map_table_i[0][1] == 0x8000) {
+
+        pa_log_info("Using stereo to mono remapping");
+        pa_set_remap_func(m, (pa_do_remap_func_t) remap_stereo_to_mono_s16ne_c,
+            (pa_do_remap_func_t) remap_stereo_to_mono_float32ne_c);
+    } else if (n_ic == 1 && n_oc == 4 &&
+            m->map_table_i[0][0] == 0x10000 && m->map_table_i[1][0] == 0x10000 &&
+            m->map_table_i[2][0] == 0x10000 && m->map_table_i[3][0] == 0x10000) {
+
+        pa_log_info("Using mono to 4-channel remapping");
+        pa_set_remap_func(m, (pa_do_remap_func_t)remap_mono_to_ch4_s16ne_c,
+            (pa_do_remap_func_t) remap_mono_to_ch4_float32ne_c);
+    } else if (n_ic == 4 && n_oc == 1 &&
+            m->map_table_i[0][0] == 0x4000 && m->map_table_i[0][1] == 0x4000 &&
+            m->map_table_i[0][2] == 0x4000 && m->map_table_i[0][3] == 0x4000) {
+
+        pa_log_info("Using 4-channel to mono remapping");
+        pa_set_remap_func(m, (pa_do_remap_func_t) remap_ch4_to_mono_s16ne_c,
+            (pa_do_remap_func_t) remap_ch4_to_mono_float32ne_c);
     } else if (pa_setup_remap_arrange(m, arrange) && n_oc == 2) {
 
         pa_log_info("Using stereo arrange remapping");

commit 555388ebba9afdc3db92dc221593cb06235f8ff0
Author: Peter Meerwald <p.meerwald at bct-electronic.com>
Date:   Wed Apr 16 15:07:25 2014 +0200

    remap: Add special remapping case which just re-arranges channels
    
    Input channels may just be copied to output channels, no mixing; this
    avoids the generic (slow) matrix remapping code in cases where channels
    are dropped or reordered.
    This makes use of the remap struct state introduced earlier.
    
    on Intel Core i7-870 @ 2.93 GHz (GCC 4.6, 64-bit):
    
    Checking special remap (s16, stereo rearrange)
    func: 126117 usec (avg: 1261.17, min = 1150, max = 2111, stddev = 117.332).
    orig: 190509 usec (avg: 1905.09, min = 1807, max = 2402, stddev = 100.984).
    Checking special remap (float, stereo rearrange)
    func: 194329 usec (avg: 1943.29, min = 1876, max = 2127, stddev = 64.3486).
    orig: 205263 usec (avg: 2052.63, min = 2005, max = 2452, stddev = 70.177).
    Checking special remap (s16, 4-channel rearrange)
    func: 278754 usec (avg: 2787.54, min = 2719, max = 3093, stddev = 78.22).
    orig: 383885 usec (avg: 3838.85, min = 3634, max = 4121, stddev = 128.522).
    Checking special remap (float, 4-channel rearrange)
    func: 312429 usec (avg: 3124.29, min = 3017, max = 3498, stddev = 120.127).
    orig: 388198 usec (avg: 3881.98, min = 3768, max = 4655, stddev = 138.441).
    
    on ARM Cortex-A8 (TI OMAP3 DM3730 @ 1GHz) (Linaro GCC 4.6):
    
    Checking special remap (s16, stereo rearrange)
    func: 1204647 usec (avg: 12046.5, min = 10406, max = 25451, stddev = 2491.9).
    orig: 1660311 usec (avg: 16603.1, min = 14740, max = 20416, stddev = 1708.07).
    Checking special remap (float, stereo rearrange)
    func: 1391392 usec (avg: 13913.9, min = 12207, max = 28260, stddev = 2238.12).
    orig: 9246707 usec (avg: 92467.1, min = 87525, max = 125611, stddev = 5494.64).
    Checking special remap (s16, 4-channel rearrange)
    func: 2540225 usec (avg: 25402.2, min = 16937, max = 68268, stddev = 10786.7).
    orig: 3319852 usec (avg: 33198.5, min = 29571, max = 36957, stddev = 1250.39).
    Checking special remap (float, 4-channel rearrange)
    func: 3024414 usec (avg: 30244.1, min = 26153, max = 58105, stddev = 4506.01).
    orig: 12643624 usec (avg: 126436, min = 120575, max = 159088, stddev = 5519.28).
    
    benchmark code will be posted as follow-up patches
    
    Signed-off-by: Peter Meerwald <pmeerw at pmeerw.net>

diff --git a/src/pulsecore/remap.c b/src/pulsecore/remap.c
index c3de424..80194a4 100644
--- a/src/pulsecore/remap.c
+++ b/src/pulsecore/remap.c
@@ -26,6 +26,7 @@
 
 #include <string.h>
 
+#include <pulse/xmalloc.h>
 #include <pulse/sample.h>
 #include <pulse/volume.h>
 #include <pulsecore/log.h>
@@ -158,6 +159,60 @@ bool pa_setup_remap_arrange(const pa_remap_t *m, int8_t arrange[PA_CHANNELS_MAX]
     return true;
 }
 
+static void remap_arrange_stereo_s16ne_c(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
+    const unsigned n_ic = m->i_ss.channels;
+    const int8_t *arrange = m->state;
+    const int8_t ic0 = arrange[0], ic1 = arrange[1];
+
+    for (; n > 0; n--) {
+        *dst++ = (ic0 >= 0) ? *(src + ic0) : 0;
+        *dst++ = (ic1 >= 0) ? *(src + ic1) : 0;
+        src += n_ic;
+    }
+}
+
+static void remap_arrange_ch4_s16ne_c(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
+    const unsigned n_ic = m->i_ss.channels;
+    const int8_t *arrange = m->state;
+    const int8_t ic0 = arrange[0], ic1 = arrange[1],
+        ic2 = arrange[2], ic3 = arrange[3];
+
+    for (; n > 0; n--) {
+        *dst++ = (ic0 >= 0) ? *(src + ic0) : 0;
+        *dst++ = (ic1 >= 0) ? *(src + ic1) : 0;
+        *dst++ = (ic2 >= 0) ? *(src + ic2) : 0;
+        *dst++ = (ic3 >= 0) ? *(src + ic3) : 0;
+        src += n_ic;
+    }
+}
+
+static void remap_arrange_stereo_float32ne_c(pa_remap_t *m, float *dst, const float *src, unsigned n) {
+    const unsigned n_ic = m->i_ss.channels;
+    const int8_t *arrange = m->state;
+    const int ic0 = arrange[0], ic1 = arrange[1];
+
+    for (; n > 0; n--) {
+        *dst++ = (ic0 >= 0) ? *(src + ic0) : 0.0f;
+        *dst++ = (ic1 >= 0) ? *(src + ic1) : 0.0f;
+        src += n_ic;
+    }
+}
+
+static void remap_arrange_ch4_float32ne_c(pa_remap_t *m, float *dst, const float *src, unsigned n) {
+    const unsigned n_ic = m->i_ss.channels;
+    const int8_t *arrange = m->state;
+    const int ic0 = arrange[0], ic1 = arrange[1],
+        ic2 = arrange[2], ic3 = arrange[3];
+
+    for (; n > 0; n--) {
+        *dst++ = (ic0 >= 0) ? *(src + ic0) : 0.0f;
+        *dst++ = (ic1 >= 0) ? *(src + ic1) : 0.0f;
+        *dst++ = (ic2 >= 0) ? *(src + ic2) : 0.0f;
+        *dst++ = (ic3 >= 0) ? *(src + ic3) : 0.0f;
+        src += n_ic;
+    }
+}
+
 void pa_set_remap_func(pa_remap_t *m, pa_do_remap_func_t func_s16,
     pa_do_remap_func_t func_float) {
 
@@ -174,6 +229,7 @@ void pa_set_remap_func(pa_remap_t *m, pa_do_remap_func_t func_s16,
 /* set the function that will execute the remapping based on the matrices */
 static void init_remap_c(pa_remap_t *m) {
     unsigned n_oc, n_ic;
+    int8_t arrange[PA_CHANNELS_MAX];
 
     n_oc = m->o_ss.channels;
     n_ic = m->i_ss.channels;
@@ -185,9 +241,25 @@ static void init_remap_c(pa_remap_t *m) {
         pa_log_info("Using mono to stereo remapping");
         pa_set_remap_func(m, (pa_do_remap_func_t) remap_mono_to_stereo_s16ne_c,
             (pa_do_remap_func_t) remap_mono_to_stereo_float32ne_c);
+    } else if (pa_setup_remap_arrange(m, arrange) && n_oc == 2) {
+
+        pa_log_info("Using stereo arrange remapping");
+        pa_set_remap_func(m, (pa_do_remap_func_t) remap_arrange_stereo_s16ne_c,
+            (pa_do_remap_func_t) remap_arrange_stereo_float32ne_c);
+
+        /* setup state */
+        m->state = pa_xnewdup(int8_t, arrange, PA_CHANNELS_MAX);
+    } else if (pa_setup_remap_arrange(m, arrange) && n_oc == 4) {
+
+        pa_log_info("Using 4-channel arrange remapping");
+        pa_set_remap_func(m, (pa_do_remap_func_t) remap_arrange_ch4_s16ne_c,
+            (pa_do_remap_func_t) remap_arrange_ch4_float32ne_c);
+
+        /* setup state */
+        m->state = pa_xnewdup(int8_t, arrange, PA_CHANNELS_MAX);
     } else {
-        pa_log_info("Using generic matrix remapping");
 
+        pa_log_info("Using generic matrix remapping");
         pa_set_remap_func(m, (pa_do_remap_func_t) remap_channels_matrix_s16ne_c,
             (pa_do_remap_func_t) remap_channels_matrix_float32ne_c);
     }

commit e92e8b11f1f3b8c3af232388a52bbd06dbaaae7f
Author: Peter Meerwald <p.meerwald at bct-electronic.com>
Date:   Wed Apr 16 15:00:45 2014 +0200

    remap: Add (optional) state to remap struct
    
    State can be used by remap function implementations to
    speed up the remapping, e.g. by precomputing things or
    even by generating specialized code for a specific channel
    remapping task.
    
    Signed-off-by: Peter Meerwald <pmeerw at pmeerw.net>

diff --git a/src/pulsecore/remap.h b/src/pulsecore/remap.h
index 157f974..66ab7bc 100644
--- a/src/pulsecore/remap.h
+++ b/src/pulsecore/remap.h
@@ -35,6 +35,7 @@ struct pa_remap {
     float map_table_f[PA_CHANNELS_MAX][PA_CHANNELS_MAX];
     int32_t map_table_i[PA_CHANNELS_MAX][PA_CHANNELS_MAX];
     pa_do_remap_func_t do_remap;
+    void *state; /* optional state information for the remap function */
 };
 
 void pa_init_remap_func(pa_remap_t *m);
diff --git a/src/pulsecore/resampler.c b/src/pulsecore/resampler.c
index 473cbd3..1153281 100644
--- a/src/pulsecore/resampler.c
+++ b/src/pulsecore/resampler.c
@@ -115,6 +115,7 @@ static int libsamplerate_init(pa_resampler*r);
 #endif
 
 static void setup_remap(const pa_resampler *r, pa_remap_t *m);
+static void free_remap(pa_remap_t *m);
 
 static int (* const init_table[])(pa_resampler*r) = {
 #ifdef HAVE_LIBSAMPLERATE
@@ -477,6 +478,8 @@ void pa_resampler_free(pa_resampler *r) {
     if (r->from_work_format_buf.memblock)
         pa_memblock_unref(r->from_work_format_buf.memblock);
 
+    free_remap(&r->remap);
+
     pa_xfree(r);
 }
 
@@ -1152,6 +1155,12 @@ static void setup_remap(const pa_resampler *r, pa_remap_t *m) {
     pa_init_remap_func(m);
 }
 
+static void free_remap(pa_remap_t *m) {
+    pa_assert(m);
+
+    pa_xfree(m->state);
+}
+
 /* check if buf's memblock is large enough to hold 'len' bytes; create a
  * new memblock if necessary and optionally preserve 'copy' data bytes */
 static void fit_buf(pa_resampler *r, pa_memchunk *buf, size_t len, size_t *size, size_t copy) {

commit 877ad8dcf8d3997f65ffe8ac4f544f6bdd8952e7
Author: Peter Meerwald <pmeerw at pmeerw.net>
Date:   Sun Apr 27 22:22:03 2014 +0200

    remap: Change remapping function argument type from void to int16_t / float as appropriate
    
    Signed-off-by: Peter Meerwald <pmeerw at pmeerw.net>

diff --git a/src/pulsecore/remap.c b/src/pulsecore/remap.c
index 6bec79f..c3de424 100644
--- a/src/pulsecore/remap.c
+++ b/src/pulsecore/remap.c
@@ -69,7 +69,7 @@ static void remap_mono_to_stereo_float32ne_c(pa_remap_t *m, float *dst, const fl
     }
 }
 
-static void remap_channels_matrix_s16ne_c(pa_remap_t *m, void *dst, const void *src, unsigned n) {
+static void remap_channels_matrix_s16ne_c(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
     unsigned oc, ic, i;
     unsigned n_ic, n_oc;
 
@@ -81,17 +81,13 @@ static void remap_channels_matrix_s16ne_c(pa_remap_t *m, void *dst, const void *
     for (oc = 0; oc < n_oc; oc++) {
 
         for (ic = 0; ic < n_ic; ic++) {
-            int16_t *d, *s;
-            int32_t vol;
-
-            vol = m->map_table_i[oc][ic];
+            int16_t *d = dst + oc;
+            const int16_t *s = src + ic;
+            int32_t vol = m->map_table_i[oc][ic];
 
             if (vol <= 0)
                 continue;
 
-            d = (int16_t *)dst + oc;
-            s = (int16_t *)src + ic;
-
             if (vol >= 0x10000) {
                 for (i = n; i > 0; i--, s += n_ic, d += n_oc)
                     *d += *s;
@@ -103,7 +99,7 @@ static void remap_channels_matrix_s16ne_c(pa_remap_t *m, void *dst, const void *
     }
 }
 
-static void remap_channels_matrix_float32ne_c(pa_remap_t *m, void *dst, const void *src, unsigned n) {
+static void remap_channels_matrix_float32ne_c(pa_remap_t *m, float *dst, const float *src, unsigned n) {
     unsigned oc, ic, i;
     unsigned n_ic, n_oc;
 
@@ -115,17 +111,13 @@ static void remap_channels_matrix_float32ne_c(pa_remap_t *m, void *dst, const vo
     for (oc = 0; oc < n_oc; oc++) {
 
         for (ic = 0; ic < n_ic; ic++) {
-            float *d, *s;
-            float vol;
-
-            vol = m->map_table_f[oc][ic];
+            float *d = dst + oc;
+            const float *s = src + ic;
+            float vol = m->map_table_f[oc][ic];
 
             if (vol <= 0.0f)
                 continue;
 
-            d = (float *)dst + oc;
-            s = (float *)src + ic;
-
             if (vol >= 1.0f) {
                 for (i = n; i > 0; i--, s += n_ic, d += n_oc)
                     *d += *s;
@@ -196,7 +188,8 @@ static void init_remap_c(pa_remap_t *m) {
     } else {
         pa_log_info("Using generic matrix remapping");
 
-        pa_set_remap_func(m, remap_channels_matrix_s16ne_c, remap_channels_matrix_float32ne_c);
+        pa_set_remap_func(m, (pa_do_remap_func_t) remap_channels_matrix_s16ne_c,
+            (pa_do_remap_func_t) remap_channels_matrix_float32ne_c);
     }
 }
 
diff --git a/src/pulsecore/remap_mmx.c b/src/pulsecore/remap_mmx.c
index bb81de0..51a6da7 100644
--- a/src/pulsecore/remap_mmx.c
+++ b/src/pulsecore/remap_mmx.c
@@ -102,7 +102,7 @@
                 " emms                          \n\t"
 
 #if defined (__i386__) || defined (__amd64__)
-static void remap_mono_to_stereo_s16ne_mmx(pa_remap_t *m, void *dst, const void *src, unsigned n) {
+static void remap_mono_to_stereo_s16ne_mmx(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
     pa_reg_x86 temp, temp2;
 
     __asm__ __volatile__ (
@@ -113,7 +113,7 @@ static void remap_mono_to_stereo_s16ne_mmx(pa_remap_t *m, void *dst, const void
     );
 }
 
-static void remap_mono_to_stereo_float32ne_mmx(pa_remap_t *m, void *dst, const void *src, unsigned n) {
+static void remap_mono_to_stereo_float32ne_mmx(pa_remap_t *m, float *dst, const float *src, unsigned n) {
     pa_reg_x86 temp, temp2;
 
     __asm__ __volatile__ (
@@ -136,7 +136,8 @@ static void init_remap_mmx(pa_remap_t *m) {
             m->map_table_i[0][0] == 0x10000 && m->map_table_i[1][0] == 0x10000) {
 
         pa_log_info("Using MMX mono to stereo remapping");
-        pa_set_remap_func(m, remap_mono_to_stereo_s16ne_mmx, remap_mono_to_stereo_float32ne_mmx);
+        pa_set_remap_func(m, (pa_do_remap_func_t) remap_mono_to_stereo_s16ne_mmx,
+            (pa_do_remap_func_t) remap_mono_to_stereo_float32ne_mmx);
     }
 }
 #endif /* defined (__i386__) || defined (__amd64__) */
diff --git a/src/pulsecore/remap_sse.c b/src/pulsecore/remap_sse.c
index 2c24c60..d502ece 100644
--- a/src/pulsecore/remap_sse.c
+++ b/src/pulsecore/remap_sse.c
@@ -101,7 +101,7 @@
                 "4:                             \n\t"
 
 #if defined (__i386__) || defined (__amd64__)
-static void remap_mono_to_stereo_s16ne_sse2(pa_remap_t *m, void *dst, const void *src, unsigned n) {
+static void remap_mono_to_stereo_s16ne_sse2(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
     pa_reg_x86 temp, temp2;
 
     __asm__ __volatile__ (
@@ -112,7 +112,7 @@ static void remap_mono_to_stereo_s16ne_sse2(pa_remap_t *m, void *dst, const void
     );
 }
 
-static void remap_mono_to_stereo_float32ne_sse2(pa_remap_t *m, void *dst, const void *src, unsigned n) {
+static void remap_mono_to_stereo_float32ne_sse2(pa_remap_t *m, float *dst, const float *src, unsigned n) {
     pa_reg_x86 temp, temp2;
 
     __asm__ __volatile__ (
@@ -135,7 +135,8 @@ static void init_remap_sse2(pa_remap_t *m) {
             m->map_table_i[0][0] == 0x10000 && m->map_table_i[1][0] == 0x10000) {
 
         pa_log_info("Using SSE2 mono to stereo remapping");
-        pa_set_remap_func(m, remap_mono_to_stereo_s16ne_sse2, remap_mono_to_stereo_float32ne_sse2);
+        pa_set_remap_func(m, (pa_do_remap_func_t) remap_mono_to_stereo_s16ne_sse2,
+            (pa_do_remap_func_t) remap_mono_to_stereo_float32ne_sse2);
     }
 }
 #endif /* defined (__i386__) || defined (__amd64__) */

commit 0967f0fcdcc0ebf53bff4d7d5d37598746938b7b
Author: Peter Meerwald <p.meerwald at bct-electronic.com>
Date:   Tue Apr 22 17:45:23 2014 +0200

    remap: Cleanup remap function selection, add pa_set_remap_func() helper
    
    Signed-off-by: Peter Meerwald <pmeerw at pmeerw.net>

diff --git a/src/pulsecore/remap.c b/src/pulsecore/remap.c
index a550b56..6bec79f 100644
--- a/src/pulsecore/remap.c
+++ b/src/pulsecore/remap.c
@@ -166,6 +166,19 @@ bool pa_setup_remap_arrange(const pa_remap_t *m, int8_t arrange[PA_CHANNELS_MAX]
     return true;
 }
 
+void pa_set_remap_func(pa_remap_t *m, pa_do_remap_func_t func_s16,
+    pa_do_remap_func_t func_float) {
+
+    pa_assert(m);
+
+    if (m->format == PA_SAMPLE_S16NE)
+        m->do_remap = func_s16;
+    else if (m->format == PA_SAMPLE_FLOAT32NE)
+        m->do_remap = func_float;
+    else
+        pa_assert_not_reached();
+}
+
 /* set the function that will execute the remapping based on the matrices */
 static void init_remap_c(pa_remap_t *m) {
     unsigned n_oc, n_ic;
@@ -178,28 +191,12 @@ static void init_remap_c(pa_remap_t *m) {
             m->map_table_i[0][0] == 0x10000 && m->map_table_i[1][0] == 0x10000) {
 
         pa_log_info("Using mono to stereo remapping");
-        switch (m->format) {
-        case PA_SAMPLE_S16NE:
-            m->do_remap = (pa_do_remap_func_t) remap_mono_to_stereo_s16ne_c;
-            break;
-        case PA_SAMPLE_FLOAT32NE:
-            m->do_remap = (pa_do_remap_func_t) remap_mono_to_stereo_float32ne_c;
-            break;
-        default:
-            pa_assert_not_reached();
-        }
+        pa_set_remap_func(m, (pa_do_remap_func_t) remap_mono_to_stereo_s16ne_c,
+            (pa_do_remap_func_t) remap_mono_to_stereo_float32ne_c);
     } else {
         pa_log_info("Using generic matrix remapping");
-        switch (m->format) {
-        case PA_SAMPLE_S16NE:
-            m->do_remap = (pa_do_remap_func_t) remap_channels_matrix_s16ne_c;
-            break;
-        case PA_SAMPLE_FLOAT32NE:
-            m->do_remap = (pa_do_remap_func_t) remap_channels_matrix_float32ne_c;
-            break;
-        default:
-            pa_assert_not_reached();
-        }
+
+        pa_set_remap_func(m, remap_channels_matrix_s16ne_c, remap_channels_matrix_float32ne_c);
     }
 }
 
diff --git a/src/pulsecore/remap.h b/src/pulsecore/remap.h
index 3bd1403..157f974 100644
--- a/src/pulsecore/remap.h
+++ b/src/pulsecore/remap.h
@@ -55,4 +55,7 @@ void pa_set_init_remap_func(pa_init_remap_func_t func);
  */
 bool pa_setup_remap_arrange(const pa_remap_t *m, int8_t arrange[PA_CHANNELS_MAX]);
 
+void pa_set_remap_func(pa_remap_t *m, pa_do_remap_func_t func_s16,
+    pa_do_remap_func_t func_float);
+
 #endif /* fooremapfoo */
diff --git a/src/pulsecore/remap_mmx.c b/src/pulsecore/remap_mmx.c
index ef8c961..bb81de0 100644
--- a/src/pulsecore/remap_mmx.c
+++ b/src/pulsecore/remap_mmx.c
@@ -136,16 +136,7 @@ static void init_remap_mmx(pa_remap_t *m) {
             m->map_table_i[0][0] == 0x10000 && m->map_table_i[1][0] == 0x10000) {
 
         pa_log_info("Using MMX mono to stereo remapping");
-        switch (m->format) {
-        case PA_SAMPLE_S16NE:
-            m->do_remap = (pa_do_remap_func_t) remap_mono_to_stereo_s16ne_mmx;
-            break;
-        case PA_SAMPLE_FLOAT32NE:
-            m->do_remap = (pa_do_remap_func_t) remap_mono_to_stereo_float32ne_mmx;
-            break;
-        default:
-            pa_assert_not_reached();
-        }
+        pa_set_remap_func(m, remap_mono_to_stereo_s16ne_mmx, remap_mono_to_stereo_float32ne_mmx);
     }
 }
 #endif /* defined (__i386__) || defined (__amd64__) */
diff --git a/src/pulsecore/remap_sse.c b/src/pulsecore/remap_sse.c
index 3d28330..2c24c60 100644
--- a/src/pulsecore/remap_sse.c
+++ b/src/pulsecore/remap_sse.c
@@ -135,16 +135,7 @@ static void init_remap_sse2(pa_remap_t *m) {
             m->map_table_i[0][0] == 0x10000 && m->map_table_i[1][0] == 0x10000) {
 
         pa_log_info("Using SSE2 mono to stereo remapping");
-        switch (m->format) {
-        case PA_SAMPLE_S16NE:
-            m->do_remap = (pa_do_remap_func_t) remap_mono_to_stereo_s16ne_sse2;
-            break;
-        case PA_SAMPLE_FLOAT32NE:
-            m->do_remap = (pa_do_remap_func_t) remap_mono_to_stereo_float32ne_sse2;
-            break;
-        default:
-            pa_assert_not_reached();
-        }
+        pa_set_remap_func(m, remap_mono_to_stereo_s16ne_sse2, remap_mono_to_stereo_float32ne_sse2);
     }
 }
 #endif /* defined (__i386__) || defined (__amd64__) */

commit 3b5868a27471ae93464527b2ab08c03eb610e14a
Author: Peter Meerwald <p.meerwald at bct-electronic.com>
Date:   Wed Apr 16 15:06:29 2014 +0200

    remap: Add helper function to setup channel arrangement information
    
    Signed-off-by: Peter Meerwald <pmeerw at pmeerw.net>

diff --git a/src/pulsecore/remap.c b/src/pulsecore/remap.c
index a08f5a6..a550b56 100644
--- a/src/pulsecore/remap.c
+++ b/src/pulsecore/remap.c
@@ -137,6 +137,35 @@ static void remap_channels_matrix_float32ne_c(pa_remap_t *m, void *dst, const vo
     }
 }
 
+bool pa_setup_remap_arrange(const pa_remap_t *m, int8_t arrange[PA_CHANNELS_MAX]) {
+    unsigned ic, oc;
+    unsigned n_ic, n_oc;
+
+    pa_assert(m);
+
+    n_ic = m->i_ss.channels;
+    n_oc = m->o_ss.channels;
+
+    for (oc = 0; oc < n_oc; oc++) {
+        arrange[oc] = -1;
+        for (ic = 0; ic < n_ic; ic++) {
+            int32_t vol = m->map_table_i[oc][ic];
+
+            /* input channel is not used */
+            if (vol == 0)
+                continue;
+
+            /* if mixing this channel, we cannot just rearrange */
+            if (vol != 0x10000 || arrange[oc] >= 0)
+                return false;
+
+            arrange[oc] = ic;
+        }
+    }
+
+    return true;
+}
+
 /* set the function that will execute the remapping based on the matrices */
 static void init_remap_c(pa_remap_t *m) {
     unsigned n_oc, n_ic;
diff --git a/src/pulsecore/remap.h b/src/pulsecore/remap.h
index fcfe682..3bd1403 100644
--- a/src/pulsecore/remap.h
+++ b/src/pulsecore/remap.h
@@ -45,4 +45,14 @@ typedef void (*pa_init_remap_func_t) (pa_remap_t *m);
 pa_init_remap_func_t pa_get_init_remap_func(void);
 void pa_set_init_remap_func(pa_init_remap_func_t func);
 
+/* Check if remapping can be performed by just copying some or all input
+ * channels' data to output channels. Returns true and a table of input
+ * channel indices, or false otherwise.
+ *
+ * The table contains an entry for each output channel. Each table entry gives
+ * either the input channel index to be copied, or -1 indicating that the
+ * output channel is not used and hence zero.
+ */
+bool pa_setup_remap_arrange(const pa_remap_t *m, int8_t arrange[PA_CHANNELS_MAX]);
+
 #endif /* fooremapfoo */

commit 647d49165373790d4a49240b69a30fb23f27aa3b
Author: Peter Meerwald <p.meerwald at bct-electronic.com>
Date:   Wed Apr 16 12:00:06 2014 +0200

    remap: Use float constant instead of double
    
    Signed-off-by: Peter Meerwald <pmeerw at pmeerw.net>

diff --git a/src/pulsecore/remap.c b/src/pulsecore/remap.c
index 4108fdd..a08f5a6 100644
--- a/src/pulsecore/remap.c
+++ b/src/pulsecore/remap.c
@@ -120,13 +120,13 @@ static void remap_channels_matrix_float32ne_c(pa_remap_t *m, void *dst, const vo
 
             vol = m->map_table_f[oc][ic];
 
-            if (vol <= 0.0)
+            if (vol <= 0.0f)
                 continue;
 
             d = (float *)dst + oc;
             s = (float *)src + ic;
 
-            if (vol >= 1.0) {
+            if (vol >= 1.0f) {
                 for (i = n; i > 0; i--, s += n_ic, d += n_oc)
                     *d += *s;
             } else {

commit c0e0e7ea8c836a0c2695d658ca2abbb166101bb3
Author: Peter Meerwald <p.meerwald at bct-electronic.com>
Date:   Wed Apr 16 11:58:11 2014 +0200

    remap: Split remapping functions into s16 and float implementation
    
    The sample format is known when the remap structure is initialized,
    no runtime decision needed.
    
    Signed-off-by: Peter Meerwald <pmeerw at pmeerw.net>

diff --git a/src/pulsecore/remap.c b/src/pulsecore/remap.c
index 136e31d..4108fdd 100644
--- a/src/pulsecore/remap.c
+++ b/src/pulsecore/remap.c
@@ -33,130 +33,107 @@
 
 #include "remap.h"
 
-static void remap_mono_to_stereo_c(pa_remap_t *m, void *dst, const void *src, unsigned n) {
+static void remap_mono_to_stereo_s16ne_c(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
     unsigned i;
 
-    switch (m->format) {
-        case PA_SAMPLE_FLOAT32NE:
-        {
-            float *d, *s;
-
-            d = (float *) dst;
-            s = (float *) src;
-
-            for (i = n >> 2; i; i--) {
-                d[0] = d[1] = s[0];
-                d[2] = d[3] = s[1];
-                d[4] = d[5] = s[2];
-                d[6] = d[7] = s[3];
-                s += 4;
-                d += 8;
-            }
-            for (i = n & 3; i; i--) {
-                d[0] = d[1] = s[0];
-                s++;
-                d += 2;
-            }
-            break;
-        }
-        case PA_SAMPLE_S16NE:
-        {
-            int16_t *d, *s;
+    for (i = n >> 2; i; i--) {
+        dst[0] = dst[1] = src[0];
+        dst[2] = dst[3] = src[1];
+        dst[4] = dst[5] = src[2];
+        dst[6] = dst[7] = src[3];
+        src += 4;
+        dst += 8;
+    }
+    for (i = n & 3; i; i--) {
+        dst[0] = dst[1] = src[0];
+        src++;
+        dst += 2;
+    }
+}
 
-            d = (int16_t *) dst;
-            s = (int16_t *) src;
+static void remap_mono_to_stereo_float32ne_c(pa_remap_t *m, float *dst, const float *src, unsigned n) {
+    unsigned i;
 
-            for (i = n >> 2; i; i--) {
-                d[0] = d[1] = s[0];
-                d[2] = d[3] = s[1];
-                d[4] = d[5] = s[2];
-                d[6] = d[7] = s[3];
-                s += 4;
-                d += 8;
-            }
-            for (i = n & 3; i; i--) {
-                d[0] = d[1] = s[0];
-                s++;
-                d += 2;
-            }
-            break;
-        }
-        default:
-            pa_assert_not_reached();
+    for (i = n >> 2; i; i--) {
+        dst[0] = dst[1] = src[0];
+        dst[2] = dst[3] = src[1];
+        dst[4] = dst[5] = src[2];
+        dst[6] = dst[7] = src[3];
+        src += 4;
+        dst += 8;
+    }
+    for (i = n & 3; i; i--) {
+        dst[0] = dst[1] = src[0];
+        src++;
+        dst += 2;
     }
 }
 
-static void remap_channels_matrix_c(pa_remap_t *m, void *dst, const void *src, unsigned n) {
+static void remap_channels_matrix_s16ne_c(pa_remap_t *m, void *dst, const void *src, unsigned n) {
     unsigned oc, ic, i;
     unsigned n_ic, n_oc;
 
     n_ic = m->i_ss.channels;
     n_oc = m->o_ss.channels;
 
-    switch (m->format) {
-        case PA_SAMPLE_FLOAT32NE:
-        {
-            float *d, *s;
+    memset(dst, 0, n * sizeof(int16_t) * n_oc);
 
-            memset(dst, 0, n * sizeof(float) * n_oc);
+    for (oc = 0; oc < n_oc; oc++) {
 
-            for (oc = 0; oc < n_oc; oc++) {
-
-                for (ic = 0; ic < n_ic; ic++) {
-                    float vol;
+        for (ic = 0; ic < n_ic; ic++) {
+            int16_t *d, *s;
+            int32_t vol;
 
-                    vol = m->map_table_f[oc][ic];
+            vol = m->map_table_i[oc][ic];
 
-                    if (vol <= 0.0)
-                        continue;
+            if (vol <= 0)
+                continue;
 
-                    d = (float *)dst + oc;
-                    s = (float *)src + ic;
+            d = (int16_t *)dst + oc;
+            s = (int16_t *)src + ic;
 
-                    if (vol >= 1.0) {
-                        for (i = n; i > 0; i--, s += n_ic, d += n_oc)
-                            *d += *s;
-                    } else {
-                        for (i = n; i > 0; i--, s += n_ic, d += n_oc)
-                            *d += *s * vol;
-                    }
-                }
+            if (vol >= 0x10000) {
+                for (i = n; i > 0; i--, s += n_ic, d += n_oc)
+                    *d += *s;
+            } else {
+                for (i = n; i > 0; i--, s += n_ic, d += n_oc)
+                    *d += (int16_t) (((int32_t)*s * vol) >> 16);
             }
-
-            break;
         }
-        case PA_SAMPLE_S16NE:
-        {
-            int16_t *d, *s;
+    }
+}
+
+static void remap_channels_matrix_float32ne_c(pa_remap_t *m, void *dst, const void *src, unsigned n) {
+    unsigned oc, ic, i;
+    unsigned n_ic, n_oc;
+
+    n_ic = m->i_ss.channels;
+    n_oc = m->o_ss.channels;
 
-            memset(dst, 0, n * sizeof(int16_t) * n_oc);
+    memset(dst, 0, n * sizeof(float) * n_oc);
 
-            for (oc = 0; oc < n_oc; oc++) {
+    for (oc = 0; oc < n_oc; oc++) {
 
-                for (ic = 0; ic < n_ic; ic++) {
-                    int32_t vol;
+        for (ic = 0; ic < n_ic; ic++) {
+            float *d, *s;
+            float vol;
 
-                    vol = m->map_table_i[oc][ic];
+            vol = m->map_table_f[oc][ic];
 
-                    if (vol <= 0)
-                        continue;
+            if (vol <= 0.0)
+                continue;
 
-                    d = (int16_t *)dst + oc;
-                    s = (int16_t *)src + ic;
+            d = (float *)dst + oc;
+            s = (float *)src + ic;
 
-                    if (vol >= 0x10000) {
-                        for (i = n; i > 0; i--, s += n_ic, d += n_oc)
-                            *d += *s;
-                    } else {
-                        for (i = n; i > 0; i--, s += n_ic, d += n_oc)
-                            *d += (int16_t) (((int32_t)*s * vol) >> 16);
-                    }
-                }
+            if (vol >= 1.0) {
+                for (i = n; i > 0; i--, s += n_ic, d += n_oc)
+                    *d += *s;
+            } else {
+                for (i = n; i > 0; i--, s += n_ic, d += n_oc)
+                    *d += *s * vol;
             }
-            break;
         }
-        default:
-            pa_assert_not_reached();
     }
 }
 
@@ -170,11 +147,30 @@ static void init_remap_c(pa_remap_t *m) {
     /* find some common channel remappings, fall back to full matrix operation. */
     if (n_ic == 1 && n_oc == 2 &&
             m->map_table_i[0][0] == 0x10000 && m->map_table_i[1][0] == 0x10000) {
-        m->do_remap = (pa_do_remap_func_t) remap_mono_to_stereo_c;
+
         pa_log_info("Using mono to stereo remapping");
+        switch (m->format) {
+        case PA_SAMPLE_S16NE:
+            m->do_remap = (pa_do_remap_func_t) remap_mono_to_stereo_s16ne_c;
+            break;
+        case PA_SAMPLE_FLOAT32NE:
+            m->do_remap = (pa_do_remap_func_t) remap_mono_to_stereo_float32ne_c;
+            break;
+        default:
+            pa_assert_not_reached();
+        }
     } else {
-        m->do_remap = (pa_do_remap_func_t) remap_channels_matrix_c;
         pa_log_info("Using generic matrix remapping");
+        switch (m->format) {
+        case PA_SAMPLE_S16NE:
+            m->do_remap = (pa_do_remap_func_t) remap_channels_matrix_s16ne_c;
+            break;
+        case PA_SAMPLE_FLOAT32NE:
+            m->do_remap = (pa_do_remap_func_t) remap_channels_matrix_float32ne_c;
+            break;
+        default:
+            pa_assert_not_reached();
+        }
     }
 }
 
diff --git a/src/pulsecore/remap_mmx.c b/src/pulsecore/remap_mmx.c
index 3d49045..ef8c961 100644
--- a/src/pulsecore/remap_mmx.c
+++ b/src/pulsecore/remap_mmx.c
@@ -102,33 +102,26 @@
                 " emms                          \n\t"
 
 #if defined (__i386__) || defined (__amd64__)
-static void remap_mono_to_stereo_mmx(pa_remap_t *m, void *dst, const void *src, unsigned n) {
+static void remap_mono_to_stereo_s16ne_mmx(pa_remap_t *m, void *dst, const void *src, unsigned n) {
     pa_reg_x86 temp, temp2;
 
-    switch (m->format) {
-        case PA_SAMPLE_FLOAT32NE:
-        {
-            __asm__ __volatile__ (
-                MONO_TO_STEREO(dq,3,7) /* do doubles to quads */
-                : "+r" (dst), "+r" (src), "=&r" (temp), "=&r" (temp2)
-                : "r" ((pa_reg_x86)n)
-                : "cc"
-            );
-            break;
-        }
-        case PA_SAMPLE_S16NE:
-        {
-            __asm__ __volatile__ (
-                MONO_TO_STEREO(wd,4,15) /* do words to doubles */
-                : "+r" (dst), "+r" (src), "=&r" (temp), "=&r" (temp2)
-                : "r" ((pa_reg_x86)n)
-                : "cc"
-            );
-            break;
-        }
-        default:
-            pa_assert_not_reached();
-    }
+    __asm__ __volatile__ (
+        MONO_TO_STEREO(wd,4,15) /* do words to doubles */
+        : "+r" (dst), "+r" (src), "=&r" (temp), "=&r" (temp2)
+        : "r" ((pa_reg_x86)n)
+        : "cc"
+    );
+}
+
+static void remap_mono_to_stereo_float32ne_mmx(pa_remap_t *m, void *dst, const void *src, unsigned n) {
+    pa_reg_x86 temp, temp2;
+
+    __asm__ __volatile__ (
+        MONO_TO_STEREO(dq,3,7) /* do doubles to quads */
+        : "+r" (dst), "+r" (src), "=&r" (temp), "=&r" (temp2)
+        : "r" ((pa_reg_x86)n)
+        : "cc"
+    );
 }
 
 /* set the function that will execute the remapping based on the matrices */
@@ -141,8 +134,18 @@ static void init_remap_mmx(pa_remap_t *m) {
     /* find some common channel remappings, fall back to full matrix operation. */
     if (n_ic == 1 && n_oc == 2 &&
             m->map_table_i[0][0] == 0x10000 && m->map_table_i[1][0] == 0x10000) {
-        m->do_remap = (pa_do_remap_func_t) remap_mono_to_stereo_mmx;
+
         pa_log_info("Using MMX mono to stereo remapping");
+        switch (m->format) {
+        case PA_SAMPLE_S16NE:
+            m->do_remap = (pa_do_remap_func_t) remap_mono_to_stereo_s16ne_mmx;
+            break;
+        case PA_SAMPLE_FLOAT32NE:
+            m->do_remap = (pa_do_remap_func_t) remap_mono_to_stereo_float32ne_mmx;
+            break;
+        default:
+            pa_assert_not_reached();
+        }
     }
 }
 #endif /* defined (__i386__) || defined (__amd64__) */
diff --git a/src/pulsecore/remap_sse.c b/src/pulsecore/remap_sse.c
index be6d3b0..3d28330 100644
--- a/src/pulsecore/remap_sse.c
+++ b/src/pulsecore/remap_sse.c
@@ -101,33 +101,26 @@
                 "4:                             \n\t"
 
 #if defined (__i386__) || defined (__amd64__)
-static void remap_mono_to_stereo_sse2(pa_remap_t *m, void *dst, const void *src, unsigned n) {
+static void remap_mono_to_stereo_s16ne_sse2(pa_remap_t *m, void *dst, const void *src, unsigned n) {
     pa_reg_x86 temp, temp2;
 
-    switch (m->format) {
-        case PA_SAMPLE_FLOAT32NE:
-        {
-            __asm__ __volatile__ (
-                MONO_TO_STEREO(dq, 4, 15) /* do doubles to quads */
-                : "+r" (dst), "+r" (src), "=&r" (temp), "=&r" (temp2)
-                : "r" ((pa_reg_x86)n)
-                : "cc"
-            );
-            break;
-        }
-        case PA_SAMPLE_S16NE:
-        {
-            __asm__ __volatile__ (
-                MONO_TO_STEREO(wd, 5, 31) /* do words to doubles */
-                : "+r" (dst), "+r" (src), "=&r" (temp), "=&r" (temp2)
-                : "r" ((pa_reg_x86)n)
-                : "cc"
-            );
-            break;
-        }
-        default:
-            pa_assert_not_reached();
-    }
+    __asm__ __volatile__ (
+        MONO_TO_STEREO(wd, 5, 31) /* do words to doubles */
+        : "+r" (dst), "+r" (src), "=&r" (temp), "=&r" (temp2)
+        : "r" ((pa_reg_x86)n)
+        : "cc"
+    );
+}
+
+static void remap_mono_to_stereo_float32ne_sse2(pa_remap_t *m, void *dst, const void *src, unsigned n) {
+    pa_reg_x86 temp, temp2;
+
+    __asm__ __volatile__ (
+        MONO_TO_STEREO(dq, 4, 15) /* do doubles to quads */
+        : "+r" (dst), "+r" (src), "=&r" (temp), "=&r" (temp2)
+        : "r" ((pa_reg_x86)n)
+        : "cc"
+    );
 }
 
 /* set the function that will execute the remapping based on the matrices */
@@ -140,8 +133,18 @@ static void init_remap_sse2(pa_remap_t *m) {
     /* find some common channel remappings, fall back to full matrix operation. */
     if (n_ic == 1 && n_oc == 2 &&
             m->map_table_i[0][0] == 0x10000 && m->map_table_i[1][0] == 0x10000) {
-        m->do_remap = (pa_do_remap_func_t) remap_mono_to_stereo_sse2;
+
         pa_log_info("Using SSE2 mono to stereo remapping");
+        switch (m->format) {
+        case PA_SAMPLE_S16NE:
+            m->do_remap = (pa_do_remap_func_t) remap_mono_to_stereo_s16ne_sse2;
+            break;
+        case PA_SAMPLE_FLOAT32NE:
+            m->do_remap = (pa_do_remap_func_t) remap_mono_to_stereo_float32ne_sse2;
+            break;
+        default:
+            pa_assert_not_reached();
+        }
     }
 }
 #endif /* defined (__i386__) || defined (__amd64__) */

commit 9362bdc8a1d5bd1ce213c517e1999644728193a2
Author: Peter Meerwald <p.meerwald at bct-electronic.com>
Date:   Wed Apr 16 11:25:58 2014 +0200

    remap: Make resampler's remap structure more self-contained
    
    Initialization of the remap structure now happens in one place
    
    Rename calc_map_table() to setup_remap(), copy sample format and
    channel specs; the remap structure is initialized when we know the
    work sample format of the resampler
    
    Signed-off-by: Peter Meerwald <pmeerw at pmeerw.net>

diff --git a/src/pulsecore/remap.c b/src/pulsecore/remap.c
index 653ee7e..136e31d 100644
--- a/src/pulsecore/remap.c
+++ b/src/pulsecore/remap.c
@@ -36,7 +36,7 @@
 static void remap_mono_to_stereo_c(pa_remap_t *m, void *dst, const void *src, unsigned n) {
     unsigned i;
 
-    switch (*m->format) {
+    switch (m->format) {
         case PA_SAMPLE_FLOAT32NE:
         {
             float *d, *s;
@@ -90,10 +90,10 @@ static void remap_channels_matrix_c(pa_remap_t *m, void *dst, const void *src, u
     unsigned oc, ic, i;
     unsigned n_ic, n_oc;
 
-    n_ic = m->i_ss->channels;
-    n_oc = m->o_ss->channels;
+    n_ic = m->i_ss.channels;
+    n_oc = m->o_ss.channels;
 
-    switch (*m->format) {
+    switch (m->format) {
         case PA_SAMPLE_FLOAT32NE:
         {
             float *d, *s;
@@ -164,8 +164,8 @@ static void remap_channels_matrix_c(pa_remap_t *m, void *dst, const void *src, u
 static void init_remap_c(pa_remap_t *m) {
     unsigned n_oc, n_ic;
 
-    n_oc = m->o_ss->channels;
-    n_ic = m->i_ss->channels;
+    n_oc = m->o_ss.channels;
+    n_ic = m->i_ss.channels;
 
     /* find some common channel remappings, fall back to full matrix operation. */
     if (n_ic == 1 && n_oc == 2 &&
diff --git a/src/pulsecore/remap.h b/src/pulsecore/remap.h
index 6411a46..fcfe682 100644
--- a/src/pulsecore/remap.h
+++ b/src/pulsecore/remap.h
@@ -30,8 +30,8 @@ typedef struct pa_remap pa_remap_t;
 typedef void (*pa_do_remap_func_t) (pa_remap_t *m, void *d, const void *s, unsigned n);
 
 struct pa_remap {
-    pa_sample_format_t *format;
-    pa_sample_spec *i_ss, *o_ss;
+    pa_sample_format_t format;
+    pa_sample_spec i_ss, o_ss;
     float map_table_f[PA_CHANNELS_MAX][PA_CHANNELS_MAX];
     int32_t map_table_i[PA_CHANNELS_MAX][PA_CHANNELS_MAX];
     pa_do_remap_func_t do_remap;
diff --git a/src/pulsecore/remap_mmx.c b/src/pulsecore/remap_mmx.c
index bf611a1..3d49045 100644
--- a/src/pulsecore/remap_mmx.c
+++ b/src/pulsecore/remap_mmx.c
@@ -105,7 +105,7 @@
 static void remap_mono_to_stereo_mmx(pa_remap_t *m, void *dst, const void *src, unsigned n) {
     pa_reg_x86 temp, temp2;
 
-    switch (*m->format) {
+    switch (m->format) {
         case PA_SAMPLE_FLOAT32NE:
         {
             __asm__ __volatile__ (
@@ -135,8 +135,8 @@ static void remap_mono_to_stereo_mmx(pa_remap_t *m, void *dst, const void *src,
 static void init_remap_mmx(pa_remap_t *m) {
     unsigned n_oc, n_ic;
 
-    n_oc = m->o_ss->channels;
-    n_ic = m->i_ss->channels;
+    n_oc = m->o_ss.channels;
+    n_ic = m->i_ss.channels;
 
     /* find some common channel remappings, fall back to full matrix operation. */
     if (n_ic == 1 && n_oc == 2 &&
diff --git a/src/pulsecore/remap_sse.c b/src/pulsecore/remap_sse.c
index f43ecb7..be6d3b0 100644
--- a/src/pulsecore/remap_sse.c
+++ b/src/pulsecore/remap_sse.c
@@ -104,7 +104,7 @@
 static void remap_mono_to_stereo_sse2(pa_remap_t *m, void *dst, const void *src, unsigned n) {
     pa_reg_x86 temp, temp2;
 
-    switch (*m->format) {
+    switch (m->format) {
         case PA_SAMPLE_FLOAT32NE:
         {
             __asm__ __volatile__ (
@@ -134,8 +134,8 @@ static void remap_mono_to_stereo_sse2(pa_remap_t *m, void *dst, const void *src,
 static void init_remap_sse2(pa_remap_t *m) {
     unsigned n_oc, n_ic;
 
-    n_oc = m->o_ss->channels;
-    n_ic = m->i_ss->channels;
+    n_oc = m->o_ss.channels;
+    n_ic = m->i_ss.channels;
 
     /* find some common channel remappings, fall back to full matrix operation. */
     if (n_ic == 1 && n_oc == 2 &&
diff --git a/src/pulsecore/resampler.c b/src/pulsecore/resampler.c
index cbe5e52..473cbd3 100644
--- a/src/pulsecore/resampler.c
+++ b/src/pulsecore/resampler.c
@@ -114,7 +114,7 @@ static int peaks_init(pa_resampler*r);
 static int libsamplerate_init(pa_resampler*r);
 #endif
 
-static void calc_map_table(const pa_resampler *r, pa_remap_t *m);
+static void setup_remap(const pa_resampler *r, pa_remap_t *m);
 
 static int (* const init_table[])(pa_resampler*r) = {
 #ifdef HAVE_LIBSAMPLERATE
@@ -378,11 +378,6 @@ pa_resampler* pa_resampler_new(
     r->i_ss = *a;
     r->o_ss = *b;
 
-    /* set up the remap structure */
-    r->remap.i_ss = &r->i_ss;
-    r->remap.o_ss = &r->o_ss;
-    r->remap.format = &r->work_format;
-
     if (am)
         r->i_cm = *am;
     else if (!pa_channel_map_init_auto(&r->i_cm, r->i_ss.channels, PA_CHANNEL_MAP_DEFAULT))
@@ -396,10 +391,8 @@ pa_resampler* pa_resampler_new(
     r->i_fz = pa_frame_size(a);
     r->o_fz = pa_frame_size(b);
 
-    /* compute channel remap table if needed */
-    if ((r->map_required = (r->i_ss.channels != r->o_ss.channels || (!(r->flags & PA_RESAMPLER_NO_REMAP) &&
-        !pa_channel_map_equal(&r->i_cm, &r->o_cm)))))
-        calc_map_table(r, &r->remap);
+    r->map_required = (r->i_ss.channels != r->o_ss.channels || (!(r->flags & PA_RESAMPLER_NO_REMAP) &&
+        !pa_channel_map_equal(&r->i_cm, &r->o_cm)));
 
     r->work_format = pa_resampler_choose_work_format(method, a->format, b->format, r->map_required);
     r->w_sz = pa_sample_size_of_format(r->work_format);
@@ -451,6 +444,10 @@ pa_resampler* pa_resampler_new(
                  pa_sample_format_to_string(b->format), pa_sample_format_to_string(r->work_format));
     pa_log_debug("  channels %d -> %d (resampling %d)", a->channels, b->channels, r->work_channels);
 
+    /* set up the remap structure */
+    if (r->map_required)
+        setup_remap(r, &r->remap);
+
     /* initialize implementation */
     if (init_table[method](r) < 0)
         goto fail;
@@ -788,7 +785,7 @@ static int front_rear_side(pa_channel_position_t p) {
     return ON_OTHER;
 }
 
-static void calc_map_table(const pa_resampler *r, pa_remap_t *m) {
+static void setup_remap(const pa_resampler *r, pa_remap_t *m) {
     unsigned oc, ic;
     unsigned n_oc, n_ic;
     bool ic_connected[PA_CHANNELS_MAX];
@@ -802,6 +799,10 @@ static void calc_map_table(const pa_resampler *r, pa_remap_t *m) {
     n_oc = r->o_ss.channels;
     n_ic = r->i_ss.channels;
 
+    m->format = r->work_format;
+    m->i_ss = r->i_ss;
+    m->o_ss = r->o_ss;
+
     memset(m->map_table_f, 0, sizeof(m->map_table_f));
     memset(m->map_table_i, 0, sizeof(m->map_table_i));
 
diff --git a/src/tests/cpu-test.c b/src/tests/cpu-test.c
index c57a375..96137c7 100644
--- a/src/tests/cpu-test.c
+++ b/src/tests/cpu-test.c
@@ -549,17 +549,12 @@ static void remap_test_mono_stereo_float(
         pa_init_remap_func_t init_func,
         pa_init_remap_func_t orig_init_func) {
 
-    pa_sample_format_t sf;
     pa_remap_t remap;
-    pa_sample_spec iss, oss;
     pa_do_remap_func_t orig_func, func;
 
-    iss.format = oss.format = sf = PA_SAMPLE_FLOAT32NE;
-    iss.channels = 1;
-    oss.channels = 2;
-    remap.format = &sf;
-    remap.i_ss = &iss;
-    remap.o_ss = &oss;
+    remap.format = PA_SAMPLE_FLOAT32NE;
+    remap.i_ss.channels = 1;
+    remap.o_ss.channels = 2;
     remap.map_table_f[0][0] = 1.0;
     remap.map_table_f[1][0] = 1.0;
     remap.map_table_i[0][0] = 0x10000;
@@ -588,17 +583,12 @@ static void remap_test_mono_stereo_s16(
         pa_init_remap_func_t init_func,
         pa_init_remap_func_t orig_init_func) {
 
-    pa_sample_format_t sf;
     pa_remap_t remap;
-    pa_sample_spec iss, oss;
     pa_do_remap_func_t orig_func, func;
 
-    iss.format = oss.format = sf = PA_SAMPLE_S16NE;
-    iss.channels = 1;
-    oss.channels = 2;
-    remap.format = &sf;
-    remap.i_ss = &iss;
-    remap.o_ss = &oss;
+    remap.format = PA_SAMPLE_S16NE;
+    remap.i_ss.channels = 1;
+    remap.o_ss.channels = 2;
     remap.map_table_f[0][0] = 1.0;
     remap.map_table_f[1][0] = 1.0;
     remap.map_table_i[0][0] = 0x10000;

commit 937b4175c2aadf4076182126c8bb2499d347a30e
Author: Peter Meerwald <p.meerwald at bct-electronic.com>
Date:   Wed Apr 16 10:48:21 2014 +0200

    remap: Refactor channel remap table computation calc_map_table()
    
    Make the resampler argument const, and pass an explicit remap
    structure argument.
    
    Signed-off-by: Peter Meerwald <pmeerw at pmeerw.net>

diff --git a/src/pulsecore/resampler.c b/src/pulsecore/resampler.c
index 334c24b..cbe5e52 100644
--- a/src/pulsecore/resampler.c
+++ b/src/pulsecore/resampler.c
@@ -114,7 +114,7 @@ static int peaks_init(pa_resampler*r);
 static int libsamplerate_init(pa_resampler*r);
 #endif
 
-static void calc_map_table(pa_resampler *r);
+static void calc_map_table(const pa_resampler *r, pa_remap_t *m);
 
 static int (* const init_table[])(pa_resampler*r) = {
 #ifdef HAVE_LIBSAMPLERATE
@@ -396,7 +396,10 @@ pa_resampler* pa_resampler_new(
     r->i_fz = pa_frame_size(a);
     r->o_fz = pa_frame_size(b);
 
-    calc_map_table(r);
+    /* compute channel remap table if needed */
+    if ((r->map_required = (r->i_ss.channels != r->o_ss.channels || (!(r->flags & PA_RESAMPLER_NO_REMAP) &&
+        !pa_channel_map_equal(&r->i_cm, &r->o_cm)))))
+        calc_map_table(r, &r->remap);
 
     r->work_format = pa_resampler_choose_work_format(method, a->format, b->format, r->map_required);
     r->w_sz = pa_sample_size_of_format(r->work_format);
@@ -785,21 +788,16 @@ static int front_rear_side(pa_channel_position_t p) {
     return ON_OTHER;
 }
 
-static void calc_map_table(pa_resampler *r) {
+static void calc_map_table(const pa_resampler *r, pa_remap_t *m) {
     unsigned oc, ic;
     unsigned n_oc, n_ic;
     bool ic_connected[PA_CHANNELS_MAX];
     bool remix;
     pa_strbuf *s;
     char *t;
-    pa_remap_t *m;
 
     pa_assert(r);
-
-    if (!(r->map_required = (r->i_ss.channels != r->o_ss.channels || (!(r->flags & PA_RESAMPLER_NO_REMAP) && !pa_channel_map_equal(&r->i_cm, &r->o_cm)))))
-        return;
-
-    m = &r->remap;
+    pa_assert(m);
 
     n_oc = r->o_ss.channels;
     n_ic = r->i_ss.channels;

commit c7190b5fd46fe7a6d5f41f2565a13134869bc181
Author: Peter Meerwald <p.meerwald at bct-electronic.com>
Date:   Wed Apr 16 11:37:50 2014 +0200

    remap: Rename pa_init_remap() to pa_init_remap_func()
    
    pa_init_remap_func() only sets the appropriate remapping function; it
    does not initialize the pa_remap struct.
    
    Signed-off-by: Peter Meerwald <pmeerw at pmeerw.net>

diff --git a/src/pulsecore/remap.c b/src/pulsecore/remap.c
index da72a62..653ee7e 100644
--- a/src/pulsecore/remap.c
+++ b/src/pulsecore/remap.c
@@ -179,15 +179,15 @@ static void init_remap_c(pa_remap_t *m) {
 }
 
 /* default C implementation */
-static pa_init_remap_func_t remap_func = init_remap_c;
+static pa_init_remap_func_t init_remap_func = init_remap_c;
 
-void pa_init_remap(pa_remap_t *m) {
-    pa_assert(remap_func);
+void pa_init_remap_func(pa_remap_t *m) {
+    pa_assert(init_remap_func);
 
     m->do_remap = NULL;
 
     /* call the installed remap init function */
-    remap_func(m);
+    init_remap_func(m);
 
     if (m->do_remap == NULL) {
         /* nothing was installed, fallback to C version */
@@ -196,9 +196,9 @@ void pa_init_remap(pa_remap_t *m) {
 }
 
 pa_init_remap_func_t pa_get_init_remap_func(void) {
-    return remap_func;
+    return init_remap_func;
 }
 
 void pa_set_init_remap_func(pa_init_remap_func_t func) {
-    remap_func = func;
+    init_remap_func = func;
 }
diff --git a/src/pulsecore/remap.h b/src/pulsecore/remap.h
index 32a67cd..6411a46 100644
--- a/src/pulsecore/remap.h
+++ b/src/pulsecore/remap.h
@@ -37,7 +37,7 @@ struct pa_remap {
     pa_do_remap_func_t do_remap;
 };
 
-void pa_init_remap (pa_remap_t *m);
+void pa_init_remap_func(pa_remap_t *m);
 
 /* custom installation of init functions */
 typedef void (*pa_init_remap_func_t) (pa_remap_t *m);
diff --git a/src/pulsecore/resampler.c b/src/pulsecore/resampler.c
index 38389f3..334c24b 100644
--- a/src/pulsecore/resampler.c
+++ b/src/pulsecore/resampler.c
@@ -1150,7 +1150,7 @@ static void calc_map_table(pa_resampler *r) {
     pa_xfree(t);
 
     /* initialize the remapping function */
-    pa_init_remap(m);
+    pa_init_remap_func(m);
 }
 
 /* check if buf's memblock is large enough to hold 'len' bytes; create a

commit fde3626b776ef06028c8d78e0326901a35c9ea11
Author: Peter Meerwald <p.meerwald at bct-electronic.com>
Date:   Wed Apr 16 19:24:01 2014 +0200

    remap: Don't use PA_VOLUME_NORM
    
    remapping operates on channel contributions, not volumes
    
    Signed-off-by: Peter Meerwald <pmeerw at pmeerw.net>

diff --git a/src/pulsecore/remap.c b/src/pulsecore/remap.c
index 8585e34..da72a62 100644
--- a/src/pulsecore/remap.c
+++ b/src/pulsecore/remap.c
@@ -169,7 +169,7 @@ static void init_remap_c(pa_remap_t *m) {
 
     /* find some common channel remappings, fall back to full matrix operation. */
     if (n_ic == 1 && n_oc == 2 &&
-            m->map_table_i[0][0] == PA_VOLUME_NORM && m->map_table_i[1][0] == PA_VOLUME_NORM) {
+            m->map_table_i[0][0] == 0x10000 && m->map_table_i[1][0] == 0x10000) {
         m->do_remap = (pa_do_remap_func_t) remap_mono_to_stereo_c;
         pa_log_info("Using mono to stereo remapping");
     } else {
diff --git a/src/pulsecore/remap_mmx.c b/src/pulsecore/remap_mmx.c
index 5b3f0f9..bf611a1 100644
--- a/src/pulsecore/remap_mmx.c
+++ b/src/pulsecore/remap_mmx.c
@@ -140,7 +140,7 @@ static void init_remap_mmx(pa_remap_t *m) {
 
     /* find some common channel remappings, fall back to full matrix operation. */
     if (n_ic == 1 && n_oc == 2 &&
-            m->map_table_i[0][0] == PA_VOLUME_NORM && m->map_table_i[1][0] == PA_VOLUME_NORM) {
+            m->map_table_i[0][0] == 0x10000 && m->map_table_i[1][0] == 0x10000) {
         m->do_remap = (pa_do_remap_func_t) remap_mono_to_stereo_mmx;
         pa_log_info("Using MMX mono to stereo remapping");
     }
diff --git a/src/pulsecore/remap_sse.c b/src/pulsecore/remap_sse.c
index 8831723..f43ecb7 100644
--- a/src/pulsecore/remap_sse.c
+++ b/src/pulsecore/remap_sse.c
@@ -139,7 +139,7 @@ static void init_remap_sse2(pa_remap_t *m) {
 
     /* find some common channel remappings, fall back to full matrix operation. */
     if (n_ic == 1 && n_oc == 2 &&
-            m->map_table_i[0][0] == PA_VOLUME_NORM && m->map_table_i[1][0] == PA_VOLUME_NORM) {
+            m->map_table_i[0][0] == 0x10000 && m->map_table_i[1][0] == 0x10000) {
         m->do_remap = (pa_do_remap_func_t) remap_mono_to_stereo_sse2;
         pa_log_info("Using SSE2 mono to stereo remapping");
     }