[Liboil-commit] liboil/i386 liboil/i386_amd64

David Schleef ds at kemper.freedesktop.org
Thu Feb 21 18:35:46 PST 2008


 liboil/i386/Makefile.am              |    1 
 liboil/i386/mas.c                    | 1114 -----------------------------------
 liboil/i386/wavelet.c                |  727 ----------------------
 liboil/i386_amd64/Makefile.am        |    3 
 liboil/i386_amd64/add2.c             |  488 +++++++++++++++
 liboil/i386_amd64/mas.c              | 1114 +++++++++++++++++++++++++++++++++++
 liboil/i386_amd64/multiply_and_acc.c |  250 +++++++
 7 files changed, 1855 insertions(+), 1842 deletions(-)

New commits:
commit 205a167d5185b9531ed008c36bc68a9bdc4cac2c
Author: David Schleef <ds at wallace.bigkitten.com>
Date:   Thu Feb 21 18:35:42 2008 -0800

    Convert some i386 code to dual-arch

diff --git a/liboil/i386/Makefile.am b/liboil/i386/Makefile.am
index dfd07c1..6f4a41e 100644
--- a/liboil/i386/Makefile.am
+++ b/liboil/i386/Makefile.am
@@ -8,7 +8,6 @@ libi386_la_SOURCES = \
 	copy8x8_i386.c \
 	diff8x8_i386.c \
 	error8x8_i386.c \
-	mas.c \
 	md5_i386.c \
 	mult8x8_i386.c \
 	recon8x8_i386.c \
diff --git a/liboil/i386/mas.c b/liboil/i386/mas.c
deleted file mode 100644
index b8d4e00..0000000
--- a/liboil/i386/mas.c
+++ /dev/null
@@ -1,1114 +0,0 @@
-
-#include <liboil/liboilfunction.h>
-#include <liboil/liboilclasses.h>
-
-
-void
-mas10_u8_mmx (uint8_t *d, const uint8_t *s1_np9, const int16_t *s2_10,
-        const int16_t *s3_2, int n)
-{
-  int j;
-  int x;
-
-  while(n&3) {
-    x = 0;
-    for(j=0;j<10;j++){
-      x += s1_np9[j] * s2_10[j];
-    }
-    *d = CLAMP((x + s3_2[0])>>s3_2[1],0,255);
-    d++;
-    s1_np9++;
-    n--;
-  }
-
-  if (n == 0) return;
-  n>>=2;
-  __asm__ __volatile__("\n"
-      "  pxor %%mm7, %%mm7\n"
-
-      "  movd (%[s3_2]), %%mm6\n"
-
-      "  movzwl 2(%[s3_2]), %%ecx\n"
-      "  movd %%ecx, %%mm5\n"
-
-      "1:\n"
-      /* load 128 */
-      "  pshufw $0x00, %%mm6, %%mm2\n"
-
-#define LOOP(x) \
-      "  movd " #x "(%[s1_np9]), %%mm0\n" \
-      "  punpcklbw %%mm7, %%mm0\n" \
-      "  movq 2*" #x "(%[s2_10]), %%mm1\n" \
-      "  pshufw $0x00, %%mm1, %%mm1\n" \
-      "  pmullw %%mm1, %%mm0\n" \
-      "  paddw %%mm0, %%mm2\n"
-
-      LOOP(0)
-      LOOP(1)
-      LOOP(2)
-      LOOP(3)
-      LOOP(4)
-      LOOP(5)
-      LOOP(6)
-      LOOP(7)
-      LOOP(8)
-      LOOP(9)
-#undef LOOP
-
-      "  psraw %%mm5, %%mm2\n"
-      "  pmaxsw %%mm7, %%mm2\n"
-      "  packuswb %%mm2, %%mm2\n"
-      "  movd %%mm2, 0(%[d])\n"
-      "  addl $4, %[d]\n"
-      "  addl $4, %[s1_np9]\n"
-      "  decl %[n]\n"
-      "  jnz 1b\n"
-      "  emms\n"
-      : [d] "+r" (d),
-        [s1_np9] "+r" (s1_np9),
-        [n] "+m" (n)
-      : [s2_10] "r" (s2_10),
-        [s3_2] "r" (s3_2)
-      : "ecx");
-}
-OIL_DEFINE_IMPL_FULL (mas10_u8_mmx, mas10_u8_l15, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
-
-void
-mas10_u8_mmx_2 (uint8_t *d, const uint8_t *s1_np9, const int16_t *s2_10,
-        const int16_t *s3_2, int n)
-{
-  int j;
-  int x;
-  int16_t coeff[4*10];
-  int16_t *ptr;
-
-  ptr = coeff;
-
-  while(n&3) {
-    x = 0;
-    for(j=0;j<10;j++){
-      x += s1_np9[j] * s2_10[j];
-    }
-    *d = CLAMP((x + s3_2[0])>>s3_2[1],0,255);
-    d++;
-    s1_np9++;
-    n--;
-  }
-
-  for(j=0;j<10;j++){
-    ptr[4*j + 0] = s2_10[j];
-    ptr[4*j + 1] = s2_10[j];
-    ptr[4*j + 2] = s2_10[j];
-    ptr[4*j + 3] = s2_10[j];
-  }
-
-  if (n == 0) return;
-  n>>=2;
-  __asm__ __volatile__("\n"
-      "  pxor %%mm7, %%mm7\n"
-
-      "  movd (%[s3_2]), %%mm6\n"
-
-      "  movzwl 2(%[s3_2]), %%ecx\n"
-      "  movd %%ecx, %%mm5\n"
-
-      "1:\n"
-      /* load 128 */
-      "  pshufw $0x00, %%mm6, %%mm2\n"
-
-#define LOOP(x) \
-      "  movd " #x "(%[s1_np9]), %%mm0\n" \
-      "  punpcklbw %%mm7, %%mm0\n" \
-      "  pmullw 8*" #x "(%[coeff]), %%mm0\n" \
-      "  paddw %%mm0, %%mm2\n"
-
-      LOOP(0)
-      LOOP(1)
-      LOOP(2)
-      LOOP(3)
-      LOOP(4)
-      LOOP(5)
-      LOOP(6)
-      LOOP(7)
-      LOOP(8)
-      LOOP(9)
-#undef LOOP
-
-      "  psraw %%mm5, %%mm2\n"
-      "  pmaxsw %%mm7, %%mm2\n"
-      "  packuswb %%mm2, %%mm2\n"
-      "  movd %%mm2, 0(%[d])\n"
-      "  addl $4, %[d]\n"
-      "  addl $4, %[s1_np9]\n"
-      "  decl %[n]\n"
-      "  jnz 1b\n"
-      "  emms\n"
-      : [d] "+r" (d),
-        [s1_np9] "+r" (s1_np9),
-        [n] "+m" (n)
-      : [coeff] "r" (ptr),
-        [s3_2] "r" (s3_2)
-      : "ecx");
-}
-OIL_DEFINE_IMPL_FULL (mas10_u8_mmx_2, mas10_u8_l15, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
-
-void
-mas10_u8_mmx_3 (uint8_t *d, const uint8_t *s1_np9, const int16_t *s2_10,
-        const int16_t *s3_2, int n)
-{
-  int j;
-  int x;
-
-  while(n&3) {
-    x = 0;
-    for(j=0;j<10;j++){
-      x += s1_np9[j] * s2_10[j];
-    }
-    *d = CLAMP((x + s3_2[0])>>s3_2[1],0,255);
-    d++;
-    s1_np9++;
-    n--;
-  }
-
-  if (n == 0) return;
-  n>>=2;
-  __asm__ __volatile__("\n"
-      "  pxor %%mm7, %%mm7\n"
-
-      "  movd (%[s3_2]), %%mm6\n"
-
-      "  movzwl 2(%[s3_2]), %%ecx\n"
-      "  movd %%ecx, %%mm5\n"
-
-      "  movq 0(%[s2_10]), %%mm3\n"
-      "  movq 8(%[s2_10]), %%mm4\n"
-
-      "1:\n"
-      /* load 128 */
-      "  pshufw $0x00, %%mm6, %%mm2\n"
-
-#define LOOP(x) \
-      "  movd " #x "(%[s1_np9]), %%mm0\n" \
-      "  punpcklbw %%mm7, %%mm0\n" \
-      "  movq 2*" #x "(%[s2_10]), %%mm1\n" \
-      "  pshufw $0x00, %%mm1, %%mm1\n" \
-      "  pmullw %%mm1, %%mm0\n" \
-      "  paddw %%mm0, %%mm2\n"
-
-      //LOOP(0)
-      "  movd 0(%[s1_np9]), %%mm0\n"
-      "  punpcklbw %%mm7, %%mm0\n"
-      "  pshufw $0x00, %%mm3, %%mm1\n"
-      "  pmullw %%mm1, %%mm0\n"
-      "  paddw %%mm0, %%mm2\n"
-
-      //LOOP(1)
-      "  movd 1(%[s1_np9]), %%mm0\n"
-      "  punpcklbw %%mm7, %%mm0\n"
-      "  pshufw $0x55*1, %%mm3, %%mm1\n"
-      "  pmullw %%mm1, %%mm0\n"
-      "  paddw %%mm0, %%mm2\n"
-
-      //LOOP(2)
-      "  movd 2(%[s1_np9]), %%mm0\n"
-      "  punpcklbw %%mm7, %%mm0\n"
-      "  pshufw $0x55*2, %%mm3, %%mm1\n"
-      "  pmullw %%mm1, %%mm0\n"
-      "  paddw %%mm0, %%mm2\n"
-
-      //LOOP(3)
-      "  movd 3(%[s1_np9]), %%mm0\n"
-      "  punpcklbw %%mm7, %%mm0\n"
-      "  pshufw $0x55*3, %%mm3, %%mm1\n"
-      "  pmullw %%mm1, %%mm0\n"
-      "  paddw %%mm0, %%mm2\n"
-
-      //LOOP(4)
-      "  movd 4(%[s1_np9]), %%mm0\n"
-      "  punpcklbw %%mm7, %%mm0\n"
-      "  pshufw $0x00, %%mm4, %%mm1\n"
-      "  pmullw %%mm1, %%mm0\n"
-      "  paddw %%mm0, %%mm2\n"
-
-      //LOOP(5)
-      "  movd 5(%[s1_np9]), %%mm0\n"
-      "  punpcklbw %%mm7, %%mm0\n"
-      "  pshufw $0x55*1, %%mm4, %%mm1\n"
-      "  pmullw %%mm1, %%mm0\n"
-      "  paddw %%mm0, %%mm2\n"
-
-      //LOOP(6)
-      "  movd 6(%[s1_np9]), %%mm0\n"
-      "  punpcklbw %%mm7, %%mm0\n"
-      "  pshufw $0x55*2, %%mm4, %%mm1\n"
-      "  pmullw %%mm1, %%mm0\n"
-      "  paddw %%mm0, %%mm2\n"
-
-      //LOOP(7)
-      "  movd 7(%[s1_np9]), %%mm0\n"
-      "  punpcklbw %%mm7, %%mm0\n"
-      "  pshufw $0x55*3, %%mm4, %%mm1\n"
-      "  pmullw %%mm1, %%mm0\n"
-      "  paddw %%mm0, %%mm2\n"
-
-      LOOP(8)
-      LOOP(9)
-#undef LOOP
-
-      "  psraw %%mm5, %%mm2\n"
-      "  pmaxsw %%mm7, %%mm2\n"
-      "  packuswb %%mm2, %%mm2\n"
-      "  movd %%mm2, 0(%[d])\n"
-      "  addl $4, %[d]\n"
-      "  addl $4, %[s1_np9]\n"
-      "  decl %[n]\n"
-      "  jnz 1b\n"
-      "  emms\n"
-      : [d] "+r" (d),
-        [s1_np9] "+r" (s1_np9),
-        [n] "+m" (n)
-      : [s2_10] "r" (s2_10),
-        [s3_2] "r" (s3_2)
-      : "ecx");
-}
-OIL_DEFINE_IMPL_FULL (mas10_u8_mmx_3, mas10_u8_l15, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
-
-void
-mas10_u8_mmx_4 (uint8_t *d, const uint8_t *s1_np9, const int16_t *s2_10,
-        const int16_t *s3_2, int n)
-{
-  if (n == 0) return;
-  __asm__ __volatile__("\n"
-      "  pxor %%mm7, %%mm7\n"
-
-      "  movzwl 0(%[s3_2]), %%ecx\n"
-      "  movd %%ecx, %%mm6\n"
-
-      "  movzwl 2(%[s3_2]), %%ecx\n"
-      "  movd %%ecx, %%mm5\n"
-
-      "1:\n"
-      "  movd 0(%[s1_np9]), %%mm0\n"
-      "  punpcklbw %%mm7, %%mm0\n"
-      "  pmaddwd 0(%[s2_10]), %%mm0\n"
-
-      "  movd 4(%[s1_np9]), %%mm1\n"
-      "  punpcklbw %%mm7, %%mm1\n"
-      "  pmaddwd 8(%[s2_10]), %%mm1\n"
-
-      "  movd 8(%[s1_np9]), %%mm2\n"
-      "  punpcklbw %%mm7, %%mm2\n"
-      "  pmaddwd 16(%[s2_10]), %%mm2\n"
-
-      "  paddd %%mm1, %%mm0\n"
-      "  movq %%mm0, %%mm1\n"
-      "  psrlq $32, %%mm0\n"
-      "  paddd %%mm1, %%mm0\n"
-      "  paddd %%mm2, %%mm0\n"
-      "  paddd %%mm6, %%mm0\n"
-
-      "  psrad %%mm5, %%mm0\n"
-      "  pmaxsw %%mm7, %%mm0\n"
-      "  packuswb %%mm0, %%mm0\n"
-      "  movd %%mm0, %%ecx\n"
-      "  movb %%cl,0(%[d])\n"
-
-      "  addl $1, %[d]\n"
-      "  addl $1, %[s1_np9]\n"
-      "  decl %[n]\n"
-      "  jnz 1b\n"
-      "  emms\n"
-      : [d] "+r" (d),
-        [s1_np9] "+r" (s1_np9),
-        [n] "+m" (n)
-      : [s2_10] "r" (s2_10),
-        [s3_2] "r" (s3_2)
-      : "ecx");
-}
-OIL_DEFINE_IMPL_FULL (mas10_u8_mmx_4, mas10_u8, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
-
-
-void
-mas8_u8_mmx_3 (uint8_t *d, const uint8_t *s1_np7, const int16_t *s2_8,
-        const int16_t *s3_2, int n)
-{
-  int j;
-  int x;
-
-  while(n&3) {
-    x = 0;
-    for(j=0;j<8;j++){
-      x += s1_np7[j] * s2_8[j];
-    }
-    *d = CLAMP((x + s3_2[0])>>s3_2[1],0,255);
-    d++;
-    s1_np7++;
-    n--;
-  }
-
-  if (n == 0) return;
-  n>>=2;
-  __asm__ __volatile__("\n"
-      "  pxor %%mm7, %%mm7\n"
-
-      "  movd (%[s3_2]), %%mm6\n"
-
-      "  movzwl 2(%[s3_2]), %%ecx\n"
-      "  movd %%ecx, %%mm5\n"
-
-      "  movq 0(%[s2_8]), %%mm3\n"
-      "  movq 8(%[s2_8]), %%mm4\n"
-
-      "1:\n"
-      /* load 128 */
-      "  pshufw $0x00, %%mm6, %%mm2\n"
-
-      "  movd 0(%[s1_np7]), %%mm0\n"
-      "  punpcklbw %%mm7, %%mm0\n"
-      "  pshufw $0x00, %%mm3, %%mm1\n"
-      "  pmullw %%mm1, %%mm0\n"
-      "  paddw %%mm0, %%mm2\n"
-
-      "  movd 1(%[s1_np7]), %%mm0\n"
-      "  punpcklbw %%mm7, %%mm0\n"
-      "  pshufw $0x55*1, %%mm3, %%mm1\n"
-      "  pmullw %%mm1, %%mm0\n"
-      "  paddw %%mm0, %%mm2\n"
-
-      "  movd 2(%[s1_np7]), %%mm0\n"
-      "  punpcklbw %%mm7, %%mm0\n"
-      "  pshufw $0x55*2, %%mm3, %%mm1\n"
-      "  pmullw %%mm1, %%mm0\n"
-      "  paddw %%mm0, %%mm2\n"
-
-      "  movd 3(%[s1_np7]), %%mm0\n"
-      "  punpcklbw %%mm7, %%mm0\n"
-      "  pshufw $0x55*3, %%mm3, %%mm1\n"
-      "  pmullw %%mm1, %%mm0\n"
-      "  paddw %%mm0, %%mm2\n"
-
-      "  movd 4(%[s1_np7]), %%mm0\n"
-      "  punpcklbw %%mm7, %%mm0\n"
-      "  pshufw $0x00, %%mm4, %%mm1\n"
-      "  pmullw %%mm1, %%mm0\n"
-      "  paddw %%mm0, %%mm2\n"
-
-      "  movd 5(%[s1_np7]), %%mm0\n"
-      "  punpcklbw %%mm7, %%mm0\n"
-      "  pshufw $0x55*1, %%mm4, %%mm1\n"
-      "  pmullw %%mm1, %%mm0\n"
-      "  paddw %%mm0, %%mm2\n"
-
-      "  movd 6(%[s1_np7]), %%mm0\n"
-      "  punpcklbw %%mm7, %%mm0\n"
-      "  pshufw $0x55*2, %%mm4, %%mm1\n"
-      "  pmullw %%mm1, %%mm0\n"
-      "  paddw %%mm0, %%mm2\n"
-
-      "  movd 7(%[s1_np7]), %%mm0\n"
-      "  punpcklbw %%mm7, %%mm0\n"
-      "  pshufw $0x55*3, %%mm4, %%mm1\n"
-      "  pmullw %%mm1, %%mm0\n"
-      "  paddw %%mm0, %%mm2\n"
-
-      "  psraw %%mm5, %%mm2\n"
-      "  pmaxsw %%mm7, %%mm2\n"
-      "  packuswb %%mm2, %%mm2\n"
-      "  movd %%mm2, 0(%[d])\n"
-      "  addl $4, %[d]\n"
-      "  addl $4, %[s1_np7]\n"
-      "  decl %[n]\n"
-      "  jnz 1b\n"
-      "  emms\n"
-      : [d] "+r" (d),
-        [s1_np7] "+r" (s1_np7),
-        [n] "+m" (n)
-      : [s2_8] "r" (s2_8),
-        [s3_2] "r" (s3_2)
-      : "ecx");
-}
-OIL_DEFINE_IMPL_FULL (mas8_u8_mmx_3, mas8_u8_l15, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
-
-void
-mas8_u8_mmx_4 (uint8_t *d, const uint8_t *s1_np7, const int16_t *s2_8,
-        const int16_t *s3_2, int n)
-{
-  if (n == 0) return;
-  __asm__ __volatile__("\n"
-      "  pxor %%mm7, %%mm7\n"
-
-      "  movzwl 0(%[s3_2]), %%ecx\n"
-      "  movd %%ecx, %%mm6\n"
-
-      "  movzwl 2(%[s3_2]), %%ecx\n"
-      "  movd %%ecx, %%mm5\n"
-
-      "1:\n"
-      "  movd 0(%[s1_np7]), %%mm0\n"
-      "  punpcklbw %%mm7, %%mm0\n"
-      "  pmaddwd 0(%[s2_8]), %%mm0\n"
-
-      "  movd 4(%[s1_np7]), %%mm1\n"
-      "  punpcklbw %%mm7, %%mm1\n"
-      "  pmaddwd 8(%[s2_8]), %%mm1\n"
-
-      "  paddd %%mm1, %%mm0\n"
-      "  movq %%mm0, %%mm1\n"
-      "  psrlq $32, %%mm0\n"
-      "  paddd %%mm1, %%mm0\n"
-      "  paddd %%mm6, %%mm0\n"
-
-      "  psrad %%mm5, %%mm0\n"
-      "  pmaxsw %%mm7, %%mm0\n"
-      "  packuswb %%mm0, %%mm0\n"
-      "  movd %%mm0, %%ecx\n"
-      "  movb %%cl,0(%[d])\n"
-
-      "  addl $1, %[d]\n"
-      "  addl $1, %[s1_np7]\n"
-      "  decl %[n]\n"
-      "  jnz 1b\n"
-      "  emms\n"
-      : [d] "+r" (d),
-        [s1_np7] "+r" (s1_np7),
-        [n] "+m" (n)
-      : [s2_8] "r" (s2_8),
-        [s3_2] "r" (s3_2)
-      : "ecx");
-}
-OIL_DEFINE_IMPL_FULL (mas8_u8_mmx_4, mas8_u8, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
-
-void
-mas8_u8_sym_mmx_3 (uint8_t *d, const uint8_t *s1_np7, const int16_t *s2_8,
-        const int16_t *s3_2, int n)
-{
-  int j;
-  int x;
-
-  while(n&3) {
-    x = 0;
-    for(j=0;j<8;j++){
-      x += s1_np7[j] * s2_8[j];
-    }
-    *d = CLAMP((x + s3_2[0])>>s3_2[1],0,255);
-    d++;
-    s1_np7++;
-    n--;
-  }
-
-  if (n == 0) return;
-  n>>=2;
-  __asm__ __volatile__("\n"
-      "  pxor %%mm7, %%mm7\n"
-
-      "  movd (%[s3_2]), %%mm6\n"
-
-      "  movzwl 2(%[s3_2]), %%ecx\n"
-      "  movd %%ecx, %%mm5\n"
-
-      "  movq 0(%[s2_8]), %%mm3\n"
-      "  movq 8(%[s2_8]), %%mm4\n"
-
-      " .p2align 4,,15                  \n"
-      "1:\n"
-      /* load 128 */
-      "  pshufw $0x00, %%mm6, %%mm2\n"
-
-      "  movd 0(%[s1_np7]), %%mm0\n"
-      "  punpcklbw %%mm7, %%mm0\n"
-      "  movd 7(%[s1_np7]), %%mm1\n"
-      "  punpcklbw %%mm7, %%mm1\n"
-      "  paddw %%mm1, %%mm0\n"
-      //"  pshufw $0x00, %%mm3, %%mm1\n"
-      //"  pmullw %%mm1, %%mm0\n"
-      //"  paddw %%mm0, %%mm2\n"
-      "  psubw %%mm0, %%mm2\n"
-
-      "  movd 1(%[s1_np7]), %%mm0\n"
-      "  punpcklbw %%mm7, %%mm0\n"
-      "  movd 6(%[s1_np7]), %%mm1\n"
-      "  punpcklbw %%mm7, %%mm1\n"
-      "  paddw %%mm1, %%mm0\n"
-      "  pshufw $0x55*1, %%mm3, %%mm1\n"
-      "  pmullw %%mm1, %%mm0\n"
-      "  paddw %%mm0, %%mm2\n"
-
-      "  movd 2(%[s1_np7]), %%mm0\n"
-      "  punpcklbw %%mm7, %%mm0\n"
-      "  movd 5(%[s1_np7]), %%mm1\n"
-      "  punpcklbw %%mm7, %%mm1\n"
-      "  paddw %%mm1, %%mm0\n"
-      "  pshufw $0x55*2, %%mm3, %%mm1\n"
-      "  pmullw %%mm1, %%mm0\n"
-      "  paddw %%mm0, %%mm2\n"
-
-      "  movd 3(%[s1_np7]), %%mm0\n"
-      "  punpcklbw %%mm7, %%mm0\n"
-      "  movd 4(%[s1_np7]), %%mm1\n"
-      "  punpcklbw %%mm7, %%mm1\n"
-      "  paddw %%mm1, %%mm0\n"
-      "  pshufw $0x55*3, %%mm3, %%mm1\n"
-      "  pmullw %%mm1, %%mm0\n"
-      "  paddw %%mm0, %%mm2\n"
-
-      "  psraw %%mm5, %%mm2\n"
-      "  pmaxsw %%mm7, %%mm2\n"
-      "  packuswb %%mm2, %%mm2\n"
-      "  movd %%mm2, 0(%[d])\n"
-      "  addl $4, %[d]\n"
-      "  addl $4, %[s1_np7]\n"
-      "  decl %[n]\n"
-      "  jnz 1b\n"
-      "  emms\n"
-      : [d] "+r" (d),
-        [s1_np7] "+r" (s1_np7),
-        [n] "+m" (n)
-      : [s2_8] "r" (s2_8),
-        [s3_2] "r" (s3_2)
-      : "ecx");
-}
-OIL_DEFINE_IMPL_FULL (mas8_u8_sym_mmx_3, mas8_u8_sym_l15, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
-
-void
-mas8_u8_sym_mmx_41 (uint8_t *d, const uint8_t *s1_np7, const int16_t *s2_8,
-        const int16_t *s3_2, int n)
-{
-  int j;
-  int x;
-  int16_t tmp[16];
-
-  while(n&3) {
-    x = 0;
-    for(j=0;j<8;j++){
-      x += s1_np7[j] * s2_8[j];
-    }
-    *d = CLAMP((x + s3_2[0])>>s3_2[1],0,255);
-    d++;
-    s1_np7++;
-    n--;
-  }
-
-  if (n == 0) return;
-  n>>=2;
-  __asm__ __volatile__("\n"
-      "  pxor %%mm7, %%mm7\n"
-
-      "  movd (%[s3_2]), %%mm6\n"
-
-      "  movzwl 2(%[s3_2]), %%ecx\n"
-      "  movd %%ecx, %%mm5\n"
-
-      "  movq 0(%[s2_8]), %%mm3\n"
-      "  pshufw $0x55*0, %%mm3, %%mm1\n"
-      "  movq %%mm1, 0(%[coeff])\n"
-      "  pshufw $0x55*1, %%mm3, %%mm1\n"
-      "  movq %%mm1, 8(%[coeff])\n"
-      "  pshufw $0x55*2, %%mm3, %%mm1\n"
-      "  movq %%mm1, 16(%[coeff])\n"
-      "  pshufw $0x55*3, %%mm3, %%mm1\n"
-      "  movq %%mm1, 24(%[coeff])\n"
-      :
-      : [s2_8] "r" (s2_8),
-        [s3_2] "r" (s3_2),
-        [coeff] "r" (tmp)
-      : "ecx");
-
-  __asm__ __volatile__("\n"
-      " .p2align 4,,15                  \n"
-      "1:\n"
-      /* load 128 */
-      "  pshufw $0x00, %%mm6, %%mm2\n"
-
-      "  movd 0(%[s1_np7]), %%mm0\n"
-      "  punpcklbw %%mm7, %%mm0\n"
-      "  movd 7(%[s1_np7]), %%mm1\n"
-      "  punpcklbw %%mm7, %%mm1\n"
-      "  paddw %%mm1, %%mm0\n"
-      "  pmullw 0(%[coeff]), %%mm0\n"
-      "  paddw %%mm0, %%mm2\n"
-
-      "  movd 1(%[s1_np7]), %%mm0\n"
-      "  punpcklbw %%mm7, %%mm0\n"
-      "  movd 6(%[s1_np7]), %%mm1\n"
-      "  punpcklbw %%mm7, %%mm1\n"
-      "  paddw %%mm1, %%mm0\n"
-      "  pmullw 8(%[coeff]), %%mm0\n"
-      "  paddw %%mm0, %%mm2\n"
-
-      "  movd 2(%[s1_np7]), %%mm0\n"
-      "  punpcklbw %%mm7, %%mm0\n"
-      "  movd 5(%[s1_np7]), %%mm1\n"
-      "  punpcklbw %%mm7, %%mm1\n"
-      "  paddw %%mm1, %%mm0\n"
-      "  pmullw 16(%[coeff]), %%mm0\n"
-      "  paddw %%mm0, %%mm2\n"
-
-      "  movd 3(%[s1_np7]), %%mm0\n"
-      "  punpcklbw %%mm7, %%mm0\n"
-      "  movd 4(%[s1_np7]), %%mm1\n"
-      "  punpcklbw %%mm7, %%mm1\n"
-      "  paddw %%mm1, %%mm0\n"
-      "  pmullw 24(%[coeff]), %%mm0\n"
-      "  paddw %%mm0, %%mm2\n"
-
-      "  psraw %%mm5, %%mm2\n"
-      "  pmaxsw %%mm7, %%mm2\n"
-      "  packuswb %%mm2, %%mm2\n"
-      "  movd %%mm2, 0(%[d])\n"
-      "  addl $4, %[d]\n"
-      "  addl $4, %[s1_np7]\n"
-      "  decl %[n]\n"
-      "  jnz 1b\n"
-      "  emms\n"
-      : [d] "+r" (d),
-        [s1_np7] "+r" (s1_np7),
-        [n] "+m" (n)
-      : [s2_8] "r" (s2_8),
-        [coeff] "r" (tmp)
-      : "ecx");
-}
-OIL_DEFINE_IMPL_FULL (mas8_u8_sym_mmx_41, mas8_u8_sym_l15, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
-
-
-#define PSHUFW_3210 "0xe4"
-#define PSHUFW_0123 "0x1b"
-
-void
-mas8_u8_sym_mmx_5 (uint8_t *d, const uint8_t *s1_np7, const int16_t *s2_8,
-        const int16_t *s3_2, int n)
-{
-  if (n==0) return;
-  __asm__ __volatile__("\n"
-      "  pxor %%mm7, %%mm7\n"
-
-      "  movzwl 0(%[s3_2]), %%ecx\n"
-      "  movd %%ecx, %%mm6\n"
-      "  pshufw $0x44, %%mm6, %%mm6\n" // 01 00 01 00
-
-      "  movzwl 2(%[s3_2]), %%ecx\n"
-      "  movd %%ecx, %%mm5\n"
-
-      "  cmpl $0, %[n]\n"
-      "  jz 2f\n"
-
-      "1:\n"
-      "  movd 0(%[s1_np7]), %%mm0\n"
-      "  punpcklbw %%mm7, %%mm0\n"
-#if 1
-      "  movd 4(%[s1_np7]), %%mm1\n"
-      "  punpcklbw %%mm7, %%mm1\n"
-      "  pshufw $0x1b, %%mm1, %%mm1\n" // 00 01 10 11
-      "  paddw %%mm1, %%mm0\n"
-      "  pmaddwd 0(%[s2_8]), %%mm0\n"
-#else
-      "  pmaddwd 0(%[s2_8]), %%mm0\n"
-
-      "  movd 4(%[s1_np7]), %%mm1\n"
-      "  punpcklbw %%mm7, %%mm1\n"
-      "  pmaddwd 8(%[s2_8]), %%mm1\n"
-      "  paddd %%mm1, %%mm0\n"
-#endif
-      
-      "  pshufw $0xee, %%mm0, %%mm1\n" // 11 10 11 10
-      "  paddd %%mm1, %%mm0\n"
-      "  paddd %%mm6, %%mm0\n"
-
-      "  psrad %%mm5, %%mm0\n"
-      "  pmaxsw %%mm7, %%mm0\n"
-      "  packuswb %%mm0, %%mm0\n"
-      "  movd %%mm0, %%ecx\n"
-      "  movb %%cl,0(%[d])\n"
-
-      "  addl $1, %[d]\n"
-      "  addl $1, %[s1_np7]\n"
-      "  decl %[n]\n"
-      "  jnz 1b\n"
-
-      "2:\n"
-      "  emms\n"
-      : [d] "+r" (d),
-        [s1_np7] "+r" (s1_np7),
-        [n] "+m" (n)
-      : [s2_8] "r" (s2_8),
-        [s3_2] "r" (s3_2)
-      : "ecx");
-}
-OIL_DEFINE_IMPL_FULL (mas8_u8_sym_mmx_5, mas8_u8_sym_l15, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
-
-void
-mas8_u8_sym_mmx_6 (uint8_t *d, const uint8_t *s1_np7, const int16_t *s2_8,
-        const int16_t *s3_2, int n)
-{
-  int8_t coeff[8];
-  int8_t *ack;
-  int i;
-
-  for(i=0;i<8;i++){
-    //coeff[i] = s2_8[i];
-    coeff[i] = i;
-  }
-  ack = coeff;
-
-  if (n==0) return;
-  __asm__ __volatile__("\n"
-      "  pxor %%mm7, %%mm7\n"
-
-      "  movzwl 0(%[s3_2]), %%ecx\n"
-      "  movd %%ecx, %%mm6\n"
-      "  pshufw $0x44, %%mm6, %%mm6\n" // 01 00 01 00
-
-      "  movzwl 2(%[s3_2]), %%ecx\n"
-      "  movd %%ecx, %%mm5\n"
-
-      "  movq 0(%[s2_8]), %%mm4\n"
-      "  packsswb 8(%[s2_8]), %%mm4\n"
-
-      "1:\n"
-      "  movq 0(%[s1_np7]), %%mm0\n"
-      "  pmaddubsw %%mm4, %%mm0\n"
-
-#if 1
-      "  pshufw $0xee, %%mm0, %%mm1\n" // 11 10 11 10
-      "  paddw %%mm1, %%mm0\n"
-      "  pshufw $0x55, %%mm0, %%mm1\n" // 01 01 01 01
-      "  paddw %%mm1, %%mm0\n"
-#else
-      "  phaddw %%mm0, %%mm0\n"
-      "  phaddw %%mm0, %%mm0\n"
-#endif
-
-      "  paddw %%mm6, %%mm0\n"
-      "  psraw %%mm5, %%mm0\n"
-      "  pmaxsw %%mm7, %%mm0\n"
-      "  packuswb %%mm0, %%mm0\n"
-      "  movd %%mm0, %%ecx\n"
-      "  movb %%cl,0(%[d])\n"
-
-      "  addl $1, %[d]\n"
-      "  addl $1, %[s1_np7]\n"
-      "  decl %[n]\n"
-      "  jnz 1b\n"
-
-      "  emms\n"
-      : [d] "+r" (d),
-        [s1_np7] "+r" (s1_np7),
-        [n] "+m" (n)
-      : [s2_8] "r" (s2_8),
-        [s3_2] "r" (s3_2)
-      : "ecx");
-}
-OIL_DEFINE_IMPL_FULL (mas8_u8_sym_mmx_6, mas8_u8_sym_l15, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT|OIL_IMPL_FLAG_SSSE3);
-
-#ifdef ENABLE_BROKEN_IMPLS
-/* This only works for the taps array: -1, 3, -7, 21, 21, -7, 3, -1 */
-void
-mas8_u8_supersym_mmx (uint8_t *d, const uint8_t *s1_np7, const int16_t *s2_8,
-        const int16_t *s3_2, int n)
-{
-  int j;
-  int x;
-
-  while(n&3) {
-    x = 0;
-    for(j=0;j<8;j++){
-      x += s1_np7[j] * s2_8[j];
-    }
-    *d = CLAMP((x + s3_2[0])>>s3_2[1],0,255);
-    d++;
-    s1_np7++;
-    n--;
-  }
-
-  n>>=2;
-  __asm__ __volatile__("\n"
-      "  pxor %%mm7, %%mm7\n"
-
-      "  movd (%[s3_2]), %%mm6\n"
-      "  pshufw $0x00, %%mm6, %%mm6\n"
-
-      "  movzwl 2(%[s3_2]), %%ecx\n"
-      "  movd %%ecx, %%mm5\n"
-
-      "  movq 0(%[s2_8]), %%mm3\n"
-      "  movq 8(%[s2_8]), %%mm4\n"
-
-      "1:\n"
-      "  movd 0(%[s1_np7]), %%mm0\n"
-      "  punpcklbw %%mm7, %%mm0\n"
-      "  movd 7(%[s1_np7]), %%mm1\n"
-      "  punpcklbw %%mm7, %%mm1\n"
-      "  paddw %%mm1, %%mm0\n"
-
-      "  movd 1(%[s1_np7]), %%mm2\n"
-      "  punpcklbw %%mm7, %%mm2\n"
-      "  movd 6(%[s1_np7]), %%mm3\n"
-      "  punpcklbw %%mm7, %%mm3\n"
-      "  paddw %%mm3, %%mm2\n"
-
-      "  paddw %%mm2, %%mm0\n"
-      "  psllw $2, %%mm2\n"
-      "  psubw %%mm0, %%mm2\n"
-      "      movq %%mm2, %%mm4\n"
-
-      "  movd 2(%[s1_np7]), %%mm0\n"
-      "  punpcklbw %%mm7, %%mm0\n"
-      "  movd 5(%[s1_np7]), %%mm1\n"
-      "  punpcklbw %%mm7, %%mm1\n"
-      "  paddw %%mm1, %%mm0\n"
-
-      "  movd 3(%[s1_np7]), %%mm2\n"
-      "  punpcklbw %%mm7, %%mm2\n"
-      "  movd 4(%[s1_np7]), %%mm3\n"
-      "  punpcklbw %%mm7, %%mm3\n"
-      "  paddw %%mm3, %%mm2\n"
-
-      "  paddw %%mm2, %%mm0\n"
-      "  psllw $2, %%mm2\n"
-      "  psubw %%mm0, %%mm2\n"
-
-      "  psubw %%mm2, %%mm4\n"
-      "  psllw $3, %%mm2\n"
-      "  paddw %%mm4, %%mm2\n"
-
-      "  paddw %%mm6, %%mm2\n"
-
-      "  psraw %%mm5, %%mm2\n"
-      "  pmaxsw %%mm7, %%mm2\n"
-      "  packuswb %%mm2, %%mm2\n"
-      "  movd %%mm2, 0(%[d])\n"
-      "  addl $4, %[d]\n"
-      "  addl $4, %[s1_np7]\n"
-      "  decl %[n]\n"
-      "  jnz 1b\n"
-      "  emms\n"
-      : [d] "+r" (d),
-        [s1_np7] "+r" (s1_np7),
-        [n] "+m" (n)
-      : [s2_8] "r" (s2_8),
-        [s3_2] "r" (s3_2)
-      : "ecx");
-}
-OIL_DEFINE_IMPL_FULL (mas8_u8_supersym_mmx, mas8_u8_sym_l15, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
-#endif
-
-void
-mas12_addc_rshift_decim2_u8_mmx_4 (uint8_t *d, const uint8_t *s1_2xnp11,
-    const int16_t *s2_12, const int16_t *s3_2, int n)
-{
-  if (n == 0) return;
-  __asm__ __volatile__("\n"
-      "  pxor %%mm7, %%mm7\n"
-
-      "  movzwl 0(%[s3_2]), %%ecx\n"
-      "  movd %%ecx, %%mm6\n"
-
-      "  movzwl 2(%[s3_2]), %%ecx\n"
-      "  movd %%ecx, %%mm5\n"
-
-      "1:\n"
-      "  movd 0(%[s1_2xnp11]), %%mm0\n"
-      "  punpcklbw %%mm7, %%mm0\n"
-      "  pmaddwd 0(%[s2_12]), %%mm0\n"
-
-      "  movd 4(%[s1_2xnp11]), %%mm1\n"
-      "  punpcklbw %%mm7, %%mm1\n"
-      "  pmaddwd 8(%[s2_12]), %%mm1\n"
-      "  paddd %%mm1, %%mm0\n"
-
-      "  movd 8(%[s1_2xnp11]), %%mm1\n"
-      "  punpcklbw %%mm7, %%mm1\n"
-      "  pmaddwd 16(%[s2_12]), %%mm1\n"
-      "  paddd %%mm1, %%mm0\n"
-
-      "  movq %%mm0, %%mm1\n"
-      "  psrlq $32, %%mm0\n"
-      "  paddd %%mm1, %%mm0\n"
-      "  paddd %%mm6, %%mm0\n"
-
-      "  psrad %%mm5, %%mm0\n"
-      "  pmaxsw %%mm7, %%mm0\n"
-      "  packuswb %%mm0, %%mm0\n"
-      "  movd %%mm0, %%ecx\n"
-      "  movb %%cl,0(%[d])\n"
-
-      "  addl $1, %[d]\n"
-      "  addl $2, %[s1_2xnp11]\n"
-      "  decl %[n]\n"
-      "  jnz 1b\n"
-      "  emms\n"
-      : [d] "+r" (d),
-        [s1_2xnp11] "+r" (s1_2xnp11),
-        [n] "+m" (n)
-      : [s2_12] "r" (s2_12),
-        [s3_2] "r" (s3_2)
-      : "ecx");
-}
-OIL_DEFINE_IMPL_FULL (mas12_addc_rshift_decim2_u8_mmx_4,
-    mas12_addc_rshift_decim2_u8, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
-
-#if 0
-void
-mas8_addc_rshift_decim2_u8_mmx_4 (uint8_t *d, const uint8_t *s1_2xnp9,
-    const int16_t *s2_8, const int16_t *s3_2, int n)
-{
-  if (n == 0) return;
-  __asm__ __volatile__("\n"
-      "  pxor %%mm7, %%mm7\n"
-
-      "  movzwl 0(%[s3_2]), %%ecx\n"
-      "  movd %%ecx, %%mm6\n"
-
-      "  movzwl 2(%[s3_2]), %%ecx\n"
-      "  movd %%ecx, %%mm5\n"
-
-      "1:\n"
-      "  movd 0(%[s1_2xnp9]), %%mm0\n"
-      "  punpcklbw %%mm7, %%mm0\n"
-      "  pmaddwd 0(%[s2_8]), %%mm0\n"
-
-      "  movd 4(%[s1_2xnp9]), %%mm1\n"
-      "  punpcklbw %%mm7, %%mm1\n"
-      "  pmaddwd 8(%[s2_8]), %%mm1\n"
-      "  paddd %%mm1, %%mm0\n"
-
-      "  movq %%mm0, %%mm1\n"
-      "  psrlq $32, %%mm0\n"
-      "  paddd %%mm1, %%mm0\n"
-      "  paddd %%mm6, %%mm0\n"
-
-      "  psrad %%mm5, %%mm0\n"
-      "  pmaxsw %%mm7, %%mm0\n"
-      "  packuswb %%mm0, %%mm0\n"
-      "  movd %%mm0, %%ecx\n"
-      "  movb %%cl,0(%[d])\n"
-
-      "  addl $1, %[d]\n"
-      "  addl $2, %[s1_2xnp9]\n"
-      "  decl %[n]\n"
-      "  jnz 1b\n"
-      "  emms\n"
-      : [d] "+r" (d),
-        [s1_2xnp9] "+r" (s1_2xnp9),
-        [n] "+m" (n)
-      : [s2_8] "r" (s2_8),
-        [s3_2] "r" (s3_2)
-      : "ecx");
-}
-OIL_DEFINE_IMPL_FULL (mas8_addc_rshift_decim2_u8_mmx_4,
-    mas8_addc_rshift_decim2_u8, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
-
-#endif
-
-void
-mas8_across_u8_mmx_3 (uint8_t *d, const uint8_t *s1_nx8, int ss1,
-    const int16_t *s2_8, const int16_t *s3_2, int n)
-{
-  int i;
-  int x;
-
-  while(n&3) {
-    x = 0;
-    for(i=0;i<8;i++){
-      x += OIL_GET(s1_nx8, i*ss1, uint8_t)*s2_8[i];
-    }
-    *d = CLAMP((x + s3_2[0])>>s3_2[1],0,255);
-    d++;
-    s1_nx8++;
-    n--;
-  }
-
-  if (n == 0) return;
-  n>>=2;
-  __asm__ __volatile__("\n"
-      "  pxor %%mm7, %%mm7\n"
-
-      "  movd (%[s3_2]), %%mm6\n"
-
-      "  movzwl 2(%[s3_2]), %%ecx\n"
-      "  movd %%ecx, %%mm5\n"
-
-      "  movq 0(%[s2_8]), %%mm3\n"
-      "  movq 8(%[s2_8]), %%mm4\n"
-      :
-      : [s2_8] "r" (s2_8),
-        [s3_2] "r" (s3_2)
-      : "ecx");
-
-  while (n > 0) {
-    const uint8_t *p = s1_nx8;
-  __asm__ __volatile__("\n"
-      "1:\n"
-      /* load 128 */
-      "  pshufw $0x00, %%mm6, %%mm2\n"
-
-      "  movd 0(%[p]), %%mm0\n"
-      "  addl %[ss1], %[p]\n"
-      "  punpcklbw %%mm7, %%mm0\n"
-      "  pshufw $0x00, %%mm3, %%mm1\n"
-      "  pmullw %%mm1, %%mm0\n"
-      "  paddw %%mm0, %%mm2\n"
-
-      "  movd 0(%[p]), %%mm0\n"
-      "  addl %[ss1], %[p]\n"
-      "  punpcklbw %%mm7, %%mm0\n"
-      "  pshufw $0x55*1, %%mm3, %%mm1\n"
-      "  pmullw %%mm1, %%mm0\n"
-      "  paddw %%mm0, %%mm2\n"
-
-      "  movd 0(%[p]), %%mm0\n"
-      "  addl %[ss1], %[p]\n"
-      "  punpcklbw %%mm7, %%mm0\n"
-      "  pshufw $0x55*2, %%mm3, %%mm1\n"
-      "  pmullw %%mm1, %%mm0\n"
-      "  paddw %%mm0, %%mm2\n"
-
-      "  movd 0(%[p]), %%mm0\n"
-      "  addl %[ss1], %[p]\n"
-      "  punpcklbw %%mm7, %%mm0\n"
-      "  pshufw $0x55*3, %%mm3, %%mm1\n"
-      "  pmullw %%mm1, %%mm0\n"
-      "  paddw %%mm0, %%mm2\n"
-
-      "  movd 0(%[p]), %%mm0\n"
-      "  addl %[ss1], %[p]\n"
-      "  punpcklbw %%mm7, %%mm0\n"
-      "  pshufw $0x00, %%mm4, %%mm1\n"
-      "  pmullw %%mm1, %%mm0\n"
-      "  paddw %%mm0, %%mm2\n"
-
-      "  movd 0(%[p]), %%mm0\n"
-      "  addl %[ss1], %[p]\n"
-      "  punpcklbw %%mm7, %%mm0\n"
-      "  pshufw $0x55*1, %%mm4, %%mm1\n"
-      "  pmullw %%mm1, %%mm0\n"
-      "  paddw %%mm0, %%mm2\n"
-
-      "  movd 0(%[p]), %%mm0\n"
-      "  addl %[ss1], %[p]\n"
-      "  punpcklbw %%mm7, %%mm0\n"
-      "  pshufw $0x55*2, %%mm4, %%mm1\n"
-      "  pmullw %%mm1, %%mm0\n"
-      "  paddw %%mm0, %%mm2\n"
-
-      "  movd 0(%[p]), %%mm0\n"
-      "  addl %[ss1], %[p]\n"
-      "  punpcklbw %%mm7, %%mm0\n"
-      "  pshufw $0x55*3, %%mm4, %%mm1\n"
-      "  pmullw %%mm1, %%mm0\n"
-      "  paddw %%mm0, %%mm2\n"
-
-      "  psraw %%mm5, %%mm2\n"
-      "  pmaxsw %%mm7, %%mm2\n"
-      "  packuswb %%mm2, %%mm2\n"
-      "  movd %%mm2, 0(%[d])\n"
-      : [p] "+r" (p)
-      : [d] "r" (d), [ss1] "r" (ss1));
-    d+=4;
-    s1_nx8+=4;
-    n--;
-  }
-
-  asm volatile ("emms");
-}
-OIL_DEFINE_IMPL_FULL (mas8_across_u8_mmx_3, mas8_across_u8, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
-
diff --git a/liboil/i386/wavelet.c b/liboil/i386/wavelet.c
index ec6fa3c..16b779a 100644
--- a/liboil/i386/wavelet.c
+++ b/liboil/i386/wavelet.c
@@ -2049,730 +2049,3 @@ lshift_s16_mmx_2(int16_t *d1, int16_t *s1, int16_t *s3_1, int n)
 OIL_DEFINE_IMPL_FULL (lshift_s16_mmx_2, lshift_s16, OIL_IMPL_FLAG_MMX);
 
 
-void
-multiply_and_acc_6xn_s16_u8_mmx (int16_t *i1, int is1, int16_t *s1,
-    int ss1, uint8_t *s2, int ss2, int n)
-{
-  /* FIXME this reads outside the arrays.  Bad. */
-  if (n==0) return;
-  __asm__ __volatile__ ("\n"
-      "  pxor %%mm7, %%mm7\n"
-      "1:\n"
-      "  movd 0(%2), %%mm0\n"
-      "  punpcklbw %%mm7, %%mm0\n"
-      "  pmullw 0(%1), %%mm0\n"
-      "  paddw 0(%0), %%mm0\n"
-      "  movq %%mm0, 0(%0)\n"
-      "   movd 4(%2), %%mm1\n"
-      "   punpcklbw %%mm7, %%mm1\n"
-      "   pmullw 8(%1), %%mm1\n"
-      "   paddw 8(%0), %%mm1\n"
-      "   movd %%mm1, 8(%0)\n"
-
-      "  addl %4, %0\n"
-      "  addl %5, %1\n"
-      "  addl %6, %2\n"
-      "  decl %3\n"
-      "  jnz 1b\n"
-      "  emms\n"
-      : "+r" (i1), "+r" (s1), "+r" (s2), "+r" (n)
-      : "m" (is1), "m" (ss1), "m" (ss2)
-      );
-}
-OIL_DEFINE_IMPL_FULL (multiply_and_acc_6xn_s16_u8_mmx,
-    multiply_and_acc_6xn_s16_u8, OIL_IMPL_FLAG_MMX);
-
-void
-multiply_and_acc_8xn_s16_u8_mmx (int16_t *i1, int is1, int16_t *s1,
-    int ss1, uint8_t *s2, int ss2, int n)
-{
-  if (n==0) return;
-  __asm__ __volatile__ ("\n"
-      "  pxor %%mm7, %%mm7\n"
-      "1:\n"
-      "  movd 0(%2), %%mm0\n"
-      "  punpcklbw %%mm7, %%mm0\n"
-      "  pmullw 0(%1), %%mm0\n"
-      "  paddw 0(%0), %%mm0\n"
-      "  movq %%mm0, 0(%0)\n"
-      "   movd 4(%2), %%mm1\n"
-      "   punpcklbw %%mm7, %%mm1\n"
-      "   pmullw 8(%1), %%mm1\n"
-      "   paddw 8(%0), %%mm1\n"
-      "   movq %%mm1, 8(%0)\n"
-
-      "  addl %4, %0\n"
-      "  addl %5, %1\n"
-      "  addl %6, %2\n"
-      "  decl %3\n"
-      "  jnz 1b\n"
-      "  emms\n"
-      : "+r" (i1), "+r" (s1), "+r" (s2), "+r" (n)
-      : "m" (is1), "m" (ss1), "m" (ss2)
-      );
-}
-OIL_DEFINE_IMPL_FULL (multiply_and_acc_8xn_s16_u8_mmx,
-    multiply_and_acc_8xn_s16_u8, OIL_IMPL_FLAG_MMX);
-
-void
-multiply_and_acc_16xn_s16_u8_mmx (int16_t *i1, int is1, int16_t *s1,
-    int ss1, uint8_t *s2, int ss2, int n)
-{
-  if (n==0) return;
-  __asm__ __volatile__ ("\n"
-      "  pxor %%mm7, %%mm7\n"
-      "1:\n"
-      "  movd 0(%2), %%mm0\n"
-      "  punpcklbw %%mm7, %%mm0\n"
-      "  pmullw 0(%1), %%mm0\n"
-      "  paddw 0(%0), %%mm0\n"
-      "  movq %%mm0, 0(%0)\n"
-      "   movd 4(%2), %%mm1\n"
-      "   punpcklbw %%mm7, %%mm1\n"
-      "   pmullw 8(%1), %%mm1\n"
-      "   paddw 8(%0), %%mm1\n"
-      "   movq %%mm1, 8(%0)\n"
-      "    movd 8(%2), %%mm2\n"
-      "    punpcklbw %%mm7, %%mm2\n"
-      "    pmullw 16(%1), %%mm2\n"
-      "    paddw 16(%0), %%mm2\n"
-      "    movq %%mm2, 16(%0)\n"
-      "     movd 12(%2), %%mm2\n"
-      "     punpcklbw %%mm7, %%mm2\n"
-      "     pmullw 24(%1), %%mm2\n"
-      "     paddw 24(%0), %%mm2\n"
-      "     movq %%mm2, 24(%0)\n"
-
-      "  addl %4, %0\n"
-      "  addl %5, %1\n"
-      "  addl %6, %2\n"
-      "  decl %3\n"
-      "  jnz 1b\n"
-      "  emms\n"
-      : "+r" (i1), "+r" (s1), "+r" (s2), "+r" (n)
-      : "m" (is1), "m" (ss1), "m" (ss2)
-      );
-}
-OIL_DEFINE_IMPL_FULL (multiply_and_acc_16xn_s16_u8_mmx,
-    multiply_and_acc_16xn_s16_u8, OIL_IMPL_FLAG_MMX);
-
-void
-multiply_and_acc_24xn_s16_u8_mmx (int16_t *i1, int is1, int16_t *s1,
-    int ss1, uint8_t *s2, int ss2, int n)
-{
-  if (n==0) return;
-  __asm__ __volatile__ ("\n"
-      "  pxor %%mm7, %%mm7\n"
-      "1:\n"
-      "  movd 0(%2), %%mm0\n"
-      "  punpcklbw %%mm7, %%mm0\n"
-      "  pmullw 0(%1), %%mm0\n"
-      "  paddw 0(%0), %%mm0\n"
-      "  movq %%mm0, 0(%0)\n"
-      "   movd 4(%2), %%mm1\n"
-      "   punpcklbw %%mm7, %%mm1\n"
-      "   pmullw 8(%1), %%mm1\n"
-      "   paddw 8(%0), %%mm1\n"
-      "   movq %%mm1, 8(%0)\n"
-      "    movd 8(%2), %%mm2\n"
-      "    punpcklbw %%mm7, %%mm2\n"
-      "    pmullw 16(%1), %%mm2\n"
-      "    paddw 16(%0), %%mm2\n"
-      "    movq %%mm2, 16(%0)\n"
-      "     movd 12(%2), %%mm2\n"
-      "     punpcklbw %%mm7, %%mm2\n"
-      "     pmullw 24(%1), %%mm2\n"
-      "     paddw 24(%0), %%mm2\n"
-      "     movq %%mm2, 24(%0)\n"
-      " movd 16(%2), %%mm2\n"
-      " punpcklbw %%mm7, %%mm2\n"
-      " pmullw 32(%1), %%mm2\n"
-      " paddw 32(%0), %%mm2\n"
-      " movq %%mm2, 32(%0)\n"
-      "  movd 20(%2), %%mm2\n"
-      "  punpcklbw %%mm7, %%mm2\n"
-      "  pmullw 40(%1), %%mm2\n"
-      "  paddw 40(%0), %%mm2\n"
-      "  movq %%mm2, 40(%0)\n"
-
-      "  addl %4, %0\n"
-      "  addl %5, %1\n"
-      "  addl %6, %2\n"
-      "  decl %3\n"
-      "  jnz 1b\n"
-      "  emms\n"
-      : "+r" (i1), "+r" (s1), "+r" (s2), "+r" (n)
-      : "m" (is1), "m" (ss1), "m" (ss2)
-      );
-}
-OIL_DEFINE_IMPL_FULL (multiply_and_acc_24xn_s16_u8_mmx,
-    multiply_and_acc_24xn_s16_u8, OIL_IMPL_FLAG_MMX);
-
-
-#if 0
-void
-mas2_add_s16_sse (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3_2,
-    int16_t *s4_2, int n)
-{
-  int shift = s4_2[1];
-
-  while (n&7) {
-    int x;
-
-    x = s4_2[0] + s2[0]*s3_2[0] + s2[1]*s3_2[1];
-    x >>= s4_2[1];
-    d1[0] = s1[0] + x;
-
-    d1++;
-    s1++;
-    s2++;
-    n--;
-  }
-  if (n==0) return;
-
-  n>>=3;
-  asm volatile ("\n"
-      "  movzwl 0(%0), %%ecx\n"
-      "  movd %%ecx, %%xmm7\n"
-      "  pshuflw $0x00, %%xmm7, %%xmm7\n"
-      "  pshufd $0x00, %%xmm7, %%xmm7\n"
-      "  movzwl 2(%0), %%ecx\n"
-      "  movd %%ecx, %%xmm6\n"
-      "  pshuflw $0x00, %%xmm6, %%xmm6\n"
-      "  pshufd $0x00, %%xmm6, %%xmm6\n"
-      "  movzwl 0(%1), %%ecx\n"
-      "  movd %%ecx, %%xmm5\n"
-      "  pshuflw $0x44, %%xmm5, %%xmm5\n"
-      "  pshufd $0x00, %%xmm5, %%xmm5\n"
-      :: "r" (s3_2), "r" (s4_2)
-      : "ecx"
-      );
-  asm volatile ("\n"
-      "1:\n"
-      "  movdqu 0(%2), %%xmm0\n"       // mm0 = s0, s1, s2, s3
-      "  movdqu 0(%2), %%xmm1\n"       // mm1 = s0, s1, s2, s3
-      "  pmullw %%xmm7, %%xmm0\n"     // mm0 = lo(s0*a0), lo(s1*a0), ...
-      "  pmulhw %%xmm7, %%xmm1\n"     // mm1 = hi(s0*a0), hi(s1*a0), ...
-      "  movdqu %%xmm0, %%xmm2\n"       // mm2 = lo(s0*a0), lo(s1*a0), ...
-      "  punpcklwd %%xmm1, %%xmm0\n"  // mm0 = s0*a0, s1*a0
-      "  punpckhwd %%xmm1, %%xmm2\n"  // mm2 = s2*a0, s3*a0
-      "  movdqu %%xmm2, %%xmm1\n"       // mm1 = s2*a0, s3*a0
-
-      "  movdqu 2(%2), %%xmm2\n"
-      "  movdqu 2(%2), %%xmm3\n"
-      "  pmullw %%xmm6, %%xmm2\n"
-      "  pmulhw %%xmm6, %%xmm3\n"
-      "  movdqu %%xmm2, %%xmm4\n"
-      "  punpcklwd %%xmm3, %%xmm2\n"  // mm2 = s1*a1, s2*a1
-      "  punpckhwd %%xmm3, %%xmm4\n"  // mm4 = s3*a1, s4*a1
-      "  movdqu %%xmm4, %%xmm3\n"       // mm3 = s3*a1, s4*a1
-
-      "  paddd %%xmm3, %%xmm1\n"      // mm1 = s2*a0 + s3*a1, ...
-      "  paddd %%xmm2, %%xmm0\n"      // mm0 = s0*a0 + s1*a1, ...
-
-      "  paddd %%xmm5, %%xmm1\n"      // mm1 = s2*a0 + s3*a1 + offset, ...
-      "  paddd %%xmm5, %%xmm0\n"      // mm0 = s0*a0 + s1*a1 + offset, ...
-
-      "  movd %4, %%xmm4\n"
-      "  psrad %%xmm4, %%xmm1\n"      // mm1 = (s2*a0 + s3*a1 + offset)>>shift, ...
-      "  psrad %%xmm4, %%xmm0\n"      // mm0 = (s0*a0 + s1*a1 + offset)>>shift, ...
-
-      "  packssdw %%xmm1, %%xmm0\n"
-      "  paddw 0(%1), %%xmm0\n"
-      "  movdqu %%xmm0, 0(%0)\n"
-      "  add $16, %0\n"
-      "  add $16, %1\n"
-      "  add $16, %2\n"
-      "  decl %3\n"
-      "  jnz 1b\n"
-      : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n)
-      : "r" (shift)
-      );
-}
-OIL_DEFINE_IMPL_FULL (mas2_add_s16_sse, mas2_add_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
-#endif
-
-
-
-
-void
-add2_rshift_add_s16_mmx (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3,
-    int16_t *s4_2, int n)
-{
-  while (n&3) {
-    int x;
-
-    x = s4_2[0] + s2[0] + s3[0];
-    x >>= s4_2[1];
-    d1[0] = s1[0] + x;
-
-    d1++;
-    s1++;
-    s2++;
-    s3++;
-    n--;
-  }
-  if (n==0) return;
-
-  n>>=2;
-  asm volatile ("\n"
-      "  movd 0(%[s4_2]), %%mm4\n"
-      "  pshufw $0x00, %%mm4, %%mm4\n"
-      "  movzwl 2(%[s4_2]), %%ecx\n"
-      "  movd %%ecx, %%mm5\n"
-      :: [s4_2] "r" (s4_2)
-      : "ecx");
-  asm volatile ("\n"
-      "1:\n"
-      "  movq %%mm4, %%mm0\n"
-      "  paddw 0(%[s2]), %%mm0\n"
-      "  paddw 0(%[s3]), %%mm0\n"
-      "  psraw %%mm5, %%mm0\n"
-      "  paddw 0(%[s1]), %%mm0\n"
-      "  movq %%mm0, 0(%[d1])\n"
-
-      "  add $8, %[d1]\n"
-      "  add $8, %[s1]\n"
-      "  add $8, %[s2]\n"
-      "  add $8, %[s3]\n"
-      "  decl %[n]\n"
-      "  jnz 1b\n"
-      "  emms\n"
-      : [d1] "+r" (d1), [s1] "+r" (s1), [s2] "+r" (s2),
-        [s3] "+r" (s3), [n] "+r" (n)
-      :
-      );
-}
-OIL_DEFINE_IMPL_FULL (add2_rshift_add_s16_mmx, add2_rshift_add_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
-
-void
-add2_rshift_sub_s16_mmx (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3,
-    int16_t *s4_2, int n)
-{
-  while (n&3) {
-    int x;
-
-    x = s4_2[0] + s2[0] + s3[0];
-    x >>= s4_2[1];
-    d1[0] = s1[0] - x;
-
-    d1++;
-    s1++;
-    s2++;
-    s3++;
-    n--;
-  }
-  if (n==0) return;
-
-  n>>=2;
-  asm volatile ("\n"
-      "  movd 0(%[s4_2]), %%mm4\n"
-      "  pshufw $0x00, %%mm4, %%mm4\n"
-      "  movzwl 2(%[s4_2]), %%ecx\n"
-      "  movd %%ecx, %%mm5\n"
-      :: [s4_2] "r" (s4_2)
-      : "ecx");
-  asm volatile ("\n"
-      "1:\n"
-      "  movq %%mm4, %%mm0\n"
-      "  paddw 0(%[s2]), %%mm0\n"
-      "  paddw 0(%[s3]), %%mm0\n"
-      "  psraw %%mm5, %%mm0\n"
-      "  movq 0(%[s1]), %%mm1\n"
-      "  psubw %%mm0, %%mm1\n"
-      "  movq %%mm1, 0(%[d1])\n"
-
-      "  add $8, %[d1]\n"
-      "  add $8, %[s1]\n"
-      "  add $8, %[s2]\n"
-      "  add $8, %[s3]\n"
-      "  decl %[n]\n"
-      "  jnz 1b\n"
-      "  emms\n"
-      : [d1] "+r" (d1), [s1] "+r" (s1), [s2] "+r" (s2),
-        [s3] "+r" (s3), [n] "+r" (n)
-      :
-      );
-}
-OIL_DEFINE_IMPL_FULL (add2_rshift_sub_s16_mmx, add2_rshift_sub_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
-
-void
-add2_rshift_add_s16_mmx_unroll2 (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3,
-    int16_t *s4_2, int n)
-{
-  while (n&7) {
-    int x;
-
-    x = s4_2[0] + s2[0] + s3[0];
-    x >>= s4_2[1];
-    d1[0] = s1[0] + x;
-
-    d1++;
-    s1++;
-    s2++;
-    s3++;
-    n--;
-  }
-  if (n==0) return;
-
-  n>>=3;
-  asm volatile ("\n"
-      "  movd 0(%[s4_2]), %%mm4\n"
-      "  pshufw $0x00, %%mm4, %%mm4\n"
-      "  movzwl 2(%[s4_2]), %%ecx\n"
-      "  movd %%ecx, %%mm5\n"
-      :: [s4_2] "r" (s4_2)
-      : "ecx");
-  asm volatile ("\n"
-      "1:\n"
-      "  movq %%mm4, %%mm0\n"
-      "  paddw 0(%[s2]), %%mm0\n"
-      "  paddw 0(%[s3]), %%mm0\n"
-      "  psraw %%mm5, %%mm0\n"
-      "  paddw 0(%[s1]), %%mm0\n"
-      "  movq %%mm0, 0(%[d1])\n"
-
-      "  movq %%mm4, %%mm0\n"
-      "  paddw 8(%[s2]), %%mm0\n"
-      "  paddw 8(%[s3]), %%mm0\n"
-      "  psraw %%mm5, %%mm0\n"
-      "  paddw 8(%[s1]), %%mm0\n"
-      "  movq %%mm0, 8(%[d1])\n"
-
-      "  add $16, %[d1]\n"
-      "  add $16, %[s1]\n"
-      "  add $16, %[s2]\n"
-      "  add $16, %[s3]\n"
-      "  decl %[n]\n"
-      "  jnz 1b\n"
-      "  emms\n"
-      : [d1] "+r" (d1), [s1] "+r" (s1), [s2] "+r" (s2),
-        [s3] "+r" (s3), [n] "+r" (n)
-      :
-      );
-}
-OIL_DEFINE_IMPL_FULL (add2_rshift_add_s16_mmx_unroll2, add2_rshift_add_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
-
-void
-add2_rshift_sub_s16_mmx_unroll2 (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3,
-    int16_t *s4_2, int n)
-{
-  while (n&7) {
-    int x;
-
-    x = s4_2[0] + s2[0] + s3[0];
-    x >>= s4_2[1];
-    d1[0] = s1[0] - x;
-
-    d1++;
-    s1++;
-    s2++;
-    s3++;
-    n--;
-  }
-  if (n==0) return;
-
-  n>>=3;
-  asm volatile ("\n"
-      "  movd 0(%[s4_2]), %%mm4\n"
-      "  pshufw $0x00, %%mm4, %%mm4\n"
-      "  movzwl 2(%[s4_2]), %%ecx\n"
-      "  movd %%ecx, %%mm5\n"
-      :: [s4_2] "r" (s4_2)
-      : "ecx");
-  asm volatile ("\n"
-      "1:\n"
-      "  movq %%mm4, %%mm0\n"
-      "  paddw 0(%[s2]), %%mm0\n"
-      "  paddw 0(%[s3]), %%mm0\n"
-      "  psraw %%mm5, %%mm0\n"
-      "  movq 0(%[s1]), %%mm1\n"
-      "  psubw %%mm0, %%mm1\n"
-      "  movq %%mm1, 0(%[d1])\n"
-
-      "  movq %%mm4, %%mm0\n"
-      "  paddw 8(%[s2]), %%mm0\n"
-      "  paddw 8(%[s3]), %%mm0\n"
-      "  psraw %%mm5, %%mm0\n"
-      "  movq 8(%[s1]), %%mm1\n"
-      "  psubw %%mm0, %%mm1\n"
-      "  movq %%mm1, 8(%[d1])\n"
-
-      "  add $16, %[d1]\n"
-      "  add $16, %[s1]\n"
-      "  add $16, %[s2]\n"
-      "  add $16, %[s3]\n"
-      "  decl %[n]\n"
-      "  jnz 1b\n"
-      "  emms\n"
-      : [d1] "+r" (d1), [s1] "+r" (s1), [s2] "+r" (s2),
-        [s3] "+r" (s3), [n] "+r" (n)
-      :
-      );
-}
-OIL_DEFINE_IMPL_FULL (add2_rshift_sub_s16_mmx_unroll2, add2_rshift_sub_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
-
-void
-add2_rshift_add_s16_mmx_unroll4 (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3,
-    int16_t *s4_2, int n)
-{
-  while (n&15) {
-    int x;
-
-    x = s4_2[0] + s2[0] + s3[0];
-    x >>= s4_2[1];
-    d1[0] = s1[0] + x;
-
-    d1++;
-    s1++;
-    s2++;
-    s3++;
-    n--;
-  }
-  if (n==0) return;
-
-  n>>=4;
-  asm volatile ("\n"
-      "  movd 0(%[s4_2]), %%mm4\n"
-      "  pshufw $0x00, %%mm4, %%mm4\n"
-      "  movzwl 2(%[s4_2]), %%ecx\n"
-      "  movd %%ecx, %%mm5\n"
-      :: [s4_2] "r" (s4_2)
-      : "ecx");
-  asm volatile ("\n"
-      "1:\n"
-      "  movq %%mm4, %%mm0\n"
-      "  paddw 0(%[s2]), %%mm0\n"
-      "  paddw 0(%[s3]), %%mm0\n"
-      "  psraw %%mm5, %%mm0\n"
-      "  paddw 0(%[s1]), %%mm0\n"
-      "  movq %%mm0, 0(%[d1])\n"
-
-      "  movq %%mm4, %%mm0\n"
-      "  paddw 8(%[s2]), %%mm0\n"
-      "  paddw 8(%[s3]), %%mm0\n"
-      "  psraw %%mm5, %%mm0\n"
-      "  paddw 8(%[s1]), %%mm0\n"
-      "  movq %%mm0, 8(%[d1])\n"
-
-      "  movq %%mm4, %%mm0\n"
-      "  paddw 16(%[s2]), %%mm0\n"
-      "  paddw 16(%[s3]), %%mm0\n"
-      "  psraw %%mm5, %%mm0\n"
-      "  paddw 16(%[s1]), %%mm0\n"
-      "  movq %%mm0, 16(%[d1])\n"
-
-      "  movq %%mm4, %%mm0\n"
-      "  paddw 24(%[s2]), %%mm0\n"
-      "  paddw 24(%[s3]), %%mm0\n"
-      "  psraw %%mm5, %%mm0\n"
-      "  paddw 24(%[s1]), %%mm0\n"
-      "  movq %%mm0, 24(%[d1])\n"
-
-      "  add $32, %[d1]\n"
-      "  add $32, %[s1]\n"
-      "  add $32, %[s2]\n"
-      "  add $32, %[s3]\n"
-      "  decl %[n]\n"
-      "  jnz 1b\n"
-      "  emms\n"
-      : [d1] "+r" (d1), [s1] "+r" (s1), [s2] "+r" (s2),
-        [s3] "+r" (s3), [n] "+r" (n)
-      :
-      );
-}
-OIL_DEFINE_IMPL_FULL (add2_rshift_add_s16_mmx_unroll4, add2_rshift_add_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
-
-void
-add2_rshift_sub_s16_mmx_unroll4 (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3,
-    int16_t *s4_2, int n)
-{
-  while (n&15) {
-    int x;
-
-    x = s4_2[0] + s2[0] + s3[0];
-    x >>= s4_2[1];
-    d1[0] = s1[0] - x;
-
-    d1++;
-    s1++;
-    s2++;
-    s3++;
-    n--;
-  }
-  if (n==0) return;
-
-  n>>=4;
-  asm volatile ("\n"
-      "  movd 0(%[s4_2]), %%mm4\n"
-      "  pshufw $0x00, %%mm4, %%mm4\n"
-      "  movzwl 2(%[s4_2]), %%ecx\n"
-      "  movd %%ecx, %%mm5\n"
-      :: [s4_2] "r" (s4_2)
-      : "ecx");
-  asm volatile ("\n"
-      "1:\n"
-      "  movq %%mm4, %%mm0\n"
-      "  paddw 0(%[s2]), %%mm0\n"
-      "  paddw 0(%[s3]), %%mm0\n"
-      "  psraw %%mm5, %%mm0\n"
-      "  movq 0(%[s1]), %%mm1\n"
-      "  psubw %%mm0, %%mm1\n"
-      "  movq %%mm1, 0(%[d1])\n"
-
-      "  movq %%mm4, %%mm2\n"
-      "  paddw 8(%[s2]), %%mm2\n"
-      "  paddw 8(%[s3]), %%mm2\n"
-      "  psraw %%mm5, %%mm2\n"
-      "  movq 8(%[s1]), %%mm3\n"
-      "  psubw %%mm2, %%mm3\n"
-      "  movq %%mm3, 8(%[d1])\n"
-
-      "  movq %%mm4, %%mm0\n"
-      "  paddw 16(%[s2]), %%mm0\n"
-      "  paddw 16(%[s3]), %%mm0\n"
-      "  psraw %%mm5, %%mm0\n"
-      "  movq 16(%[s1]), %%mm1\n"
-      "  psubw %%mm0, %%mm1\n"
-      "  movq %%mm1, 16(%[d1])\n"
-
-      "  movq %%mm4, %%mm2\n"
-      "  paddw 24(%[s2]), %%mm2\n"
-      "  paddw 24(%[s3]), %%mm2\n"
-      "  psraw %%mm5, %%mm2\n"
-      "  movq 24(%[s1]), %%mm3\n"
-      "  psubw %%mm2, %%mm3\n"
-      "  movq %%mm3, 24(%[d1])\n"
-
-      "  add $32, %[d1]\n"
-      "  add $32, %[s1]\n"
-      "  add $32, %[s2]\n"
-      "  add $32, %[s3]\n"
-      "  decl %[n]\n"
-      "  jnz 1b\n"
-      "  emms\n"
-      : [d1] "+r" (d1), [s1] "+r" (s1), [s2] "+r" (s2),
-        [s3] "+r" (s3), [n] "+r" (n)
-      :
-      );
-}
-OIL_DEFINE_IMPL_FULL (add2_rshift_sub_s16_mmx_unroll4, add2_rshift_sub_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
-
-
-void
-add2_rshift_add_s16_sse (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3,
-    int16_t *s4_2, int n)
-{
-  while (n&7) {
-    int x;
-
-    x = s4_2[0] + s2[0] + s3[0];
-    x >>= s4_2[1];
-    d1[0] = s1[0] + x;
-
-    d1++;
-    s1++;
-    s2++;
-    s3++;
-    n--;
-  }
-  if (n==0) return;
-
-  n>>=3;
-  asm volatile ("\n"
-      "  movd 0(%[s4_2]), %%xmm4\n"
-      "  pshuflw $0x00, %%xmm4, %%xmm4\n"
-      "  pshufd $0x00, %%xmm4, %%xmm4\n"
-      "  movzwl 2(%[s4_2]), %%ecx\n"
-      "  movd %%ecx, %%xmm5\n"
-      :: [s4_2] "r" (s4_2)
-      : "ecx");
-  asm volatile ("\n"
-      "1:\n"
-#if 0
-      "  movdqu %%xmm4, %%xmm0\n"
-      "  movdqu 0(%[s2]), %%xmm1\n"
-      "  paddw %%xmm1, %%xmm0\n"
-      "  movdqu 0(%[s3]), %%xmm2\n"
-      "  paddw %%xmm2, %%xmm0\n"
-      "  psraw %%xmm5, %%xmm0\n"
-      "  movdqu 0(%[d1]), %%xmm1\n"
-      "  paddw %%xmm1, %%xmm0\n"
-      "  movdqu %%xmm0, 0(%[d1])\n"
-#endif
-      "  movdqu %%xmm4, %%xmm0\n"
-      "  movdqu 0(%[s2]), %%xmm1\n"
-      "  paddw %%xmm1, %%xmm0\n"
-      "  movdqu 0(%[s3]), %%xmm2\n"
-      "  paddw %%xmm2, %%xmm0\n"
-      "  psraw %%xmm5, %%xmm0\n"
-      "  movdqu 0(%[s1]), %%xmm1\n"
-      "  paddw %%xmm0, %%xmm1\n"
-      "  movdqu %%xmm1, 0(%[d1])\n"
-
-      "  add $16, %[d1]\n"
-      "  add $16, %[s1]\n"
-      "  add $16, %[s2]\n"
-      "  add $16, %[s3]\n"
-      "  decl %[n]\n"
-      "  jnz 1b\n"
-      : [d1] "+r" (d1), [s1] "+r" (s1), [s2] "+r" (s2),
-        [s3] "+r" (s3), [n] "+r" (n)
-      :
-      );
-}
-OIL_DEFINE_IMPL_FULL (add2_rshift_add_s16_sse, add2_rshift_add_s16, OIL_IMPL_FLAG_MMXEXT|OIL_IMPL_FLAG_SSE|OIL_IMPL_FLAG_SSE2);
-
-void
-add2_rshift_sub_s16_sse (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3,
-    int16_t *s4_2, int n)
-{
-  while (n&7) {
-    int x;
-
-    x = s4_2[0] + s2[0] + s3[0];
-    x >>= s4_2[1];
-    d1[0] = s1[0] - x;
-
-    d1++;
-    s1++;
-    s2++;
-    s3++;
-    n--;
-  }
-  if (n==0) return;
-
-  n>>=3;
-  asm volatile ("\n"
-      "  movd 0(%[s4_2]), %%xmm4\n"
-      "  pshuflw $0x00, %%xmm4, %%xmm4\n"
-      "  pshufd $0x00, %%xmm4, %%xmm4\n"
-      "  movzwl 2(%[s4_2]), %%ecx\n"
-      "  movd %%ecx, %%xmm5\n"
-      :: [s4_2] "r" (s4_2)
-      : "ecx");
-  asm volatile ("\n"
-      "1:\n"
-      "  movdqu %%xmm4, %%xmm0\n"
-      "  movdqu 0(%[s2]), %%xmm1\n"
-      "  paddw %%xmm1, %%xmm0\n"
-      "  movdqu 0(%[s3]), %%xmm2\n"
-      "  paddw %%xmm2, %%xmm0\n"
-      "  psraw %%xmm5, %%xmm0\n"
-      "  movdqu 0(%[s1]), %%xmm1\n"
-      "  psubw %%xmm0, %%xmm1\n"
-      "  movdqu %%xmm1, 0(%[d1])\n"
-
-      "  add $16, %[d1]\n"
-      "  add $16, %[s1]\n"
-      "  add $16, %[s2]\n"
-      "  add $16, %[s3]\n"
-      "  decl %[n]\n"
-      "  jnz 1b\n"
-      : [d1] "+r" (d1), [s1] "+r" (s1), [s2] "+r" (s2),
-        [s3] "+r" (s3), [n] "+r" (n)
-      :
-      );
-}
-OIL_DEFINE_IMPL_FULL (add2_rshift_sub_s16_sse, add2_rshift_sub_s16, OIL_IMPL_FLAG_MMXEXT|OIL_IMPL_FLAG_SSE|OIL_IMPL_FLAG_SSE2);
-
diff --git a/liboil/i386_amd64/Makefile.am b/liboil/i386_amd64/Makefile.am
index b7349bb..b44dae1 100644
--- a/liboil/i386_amd64/Makefile.am
+++ b/liboil/i386_amd64/Makefile.am
@@ -2,12 +2,15 @@
 noinst_LTLIBRARIES = libi386_amd64.la
 
 libi386_amd64_la_SOURCES = \
+	add2.c \
 	addc.c \
 	clamp.c \
 	convert.c \
 	copy.c \
 	idct8x8_i386.c \
+	mas.c \
 	math.c \
+	multiply_and_acc.c \
 	mt19937.c \
 	resample.c \
 	sad8x8.c \
diff --git a/liboil/i386_amd64/add2.c b/liboil/i386_amd64/add2.c
new file mode 100644
index 0000000..7c4f3ab
--- /dev/null
+++ b/liboil/i386_amd64/add2.c
@@ -0,0 +1,488 @@
+
+#include <liboil/liboilfunction.h>
+#include <liboil/liboilclasses.h>
+
+
+
+
+void
+add2_rshift_add_s16_mmx (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3,
+    int16_t *s4_2, int n)
+{
+  while (n&3) {
+    int x;
+
+    x = s4_2[0] + s2[0] + s3[0];
+    x >>= s4_2[1];
+    d1[0] = s1[0] + x;
+
+    d1++;
+    s1++;
+    s2++;
+    s3++;
+    n--;
+  }
+  if (n==0) return;
+
+  n>>=2;
+  asm volatile ("\n"
+      "  movd 0(%[s4_2]), %%mm4\n"
+      "  pshufw $0x00, %%mm4, %%mm4\n"
+      "  movzwl 2(%[s4_2]), %%ecx\n"
+      "  movd %%ecx, %%mm5\n"
+      :: [s4_2] "r" (s4_2)
+      : "ecx");
+  asm volatile ("\n"
+      "1:\n"
+      "  movq %%mm4, %%mm0\n"
+      "  paddw 0(%[s2]), %%mm0\n"
+      "  paddw 0(%[s3]), %%mm0\n"
+      "  psraw %%mm5, %%mm0\n"
+      "  paddw 0(%[s1]), %%mm0\n"
+      "  movq %%mm0, 0(%[d1])\n"
+
+      "  add $8, %[d1]\n"
+      "  add $8, %[s1]\n"
+      "  add $8, %[s2]\n"
+      "  add $8, %[s3]\n"
+      "  decl %[n]\n"
+      "  jnz 1b\n"
+      "  emms\n"
+      : [d1] "+r" (d1), [s1] "+r" (s1), [s2] "+r" (s2),
+        [s3] "+r" (s3), [n] "+r" (n)
+      :
+      );
+}
+OIL_DEFINE_IMPL_FULL (add2_rshift_add_s16_mmx, add2_rshift_add_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
+
+void
+add2_rshift_sub_s16_mmx (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3,
+    int16_t *s4_2, int n)
+{
+  while (n&3) {
+    int x;
+
+    x = s4_2[0] + s2[0] + s3[0];
+    x >>= s4_2[1];
+    d1[0] = s1[0] - x;
+
+    d1++;
+    s1++;
+    s2++;
+    s3++;
+    n--;
+  }
+  if (n==0) return;
+
+  n>>=2;
+  asm volatile ("\n"
+      "  movd 0(%[s4_2]), %%mm4\n"
+      "  pshufw $0x00, %%mm4, %%mm4\n"
+      "  movzwl 2(%[s4_2]), %%ecx\n"
+      "  movd %%ecx, %%mm5\n"
+      :: [s4_2] "r" (s4_2)
+      : "ecx");
+  asm volatile ("\n"
+      "1:\n"
+      "  movq %%mm4, %%mm0\n"
+      "  paddw 0(%[s2]), %%mm0\n"
+      "  paddw 0(%[s3]), %%mm0\n"
+      "  psraw %%mm5, %%mm0\n"
+      "  movq 0(%[s1]), %%mm1\n"
+      "  psubw %%mm0, %%mm1\n"
+      "  movq %%mm1, 0(%[d1])\n"
+
+      "  add $8, %[d1]\n"
+      "  add $8, %[s1]\n"
+      "  add $8, %[s2]\n"
+      "  add $8, %[s3]\n"
+      "  decl %[n]\n"
+      "  jnz 1b\n"
+      "  emms\n"
+      : [d1] "+r" (d1), [s1] "+r" (s1), [s2] "+r" (s2),
+        [s3] "+r" (s3), [n] "+r" (n)
+      :
+      );
+}
+OIL_DEFINE_IMPL_FULL (add2_rshift_sub_s16_mmx, add2_rshift_sub_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
+
+void
+add2_rshift_add_s16_mmx_unroll2 (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3,
+    int16_t *s4_2, int n)
+{
+  while (n&7) {
+    int x;
+
+    x = s4_2[0] + s2[0] + s3[0];
+    x >>= s4_2[1];
+    d1[0] = s1[0] + x;
+
+    d1++;
+    s1++;
+    s2++;
+    s3++;
+    n--;
+  }
+  if (n==0) return;
+
+  n>>=3;
+  asm volatile ("\n"
+      "  movd 0(%[s4_2]), %%mm4\n"
+      "  pshufw $0x00, %%mm4, %%mm4\n"
+      "  movzwl 2(%[s4_2]), %%ecx\n"
+      "  movd %%ecx, %%mm5\n"
+      :: [s4_2] "r" (s4_2)
+      : "ecx");
+  asm volatile ("\n"
+      "1:\n"
+      "  movq %%mm4, %%mm0\n"
+      "  paddw 0(%[s2]), %%mm0\n"
+      "  paddw 0(%[s3]), %%mm0\n"
+      "  psraw %%mm5, %%mm0\n"
+      "  paddw 0(%[s1]), %%mm0\n"
+      "  movq %%mm0, 0(%[d1])\n"
+
+      "  movq %%mm4, %%mm0\n"
+      "  paddw 8(%[s2]), %%mm0\n"
+      "  paddw 8(%[s3]), %%mm0\n"
+      "  psraw %%mm5, %%mm0\n"
+      "  paddw 8(%[s1]), %%mm0\n"
+      "  movq %%mm0, 8(%[d1])\n"
+
+      "  add $16, %[d1]\n"
+      "  add $16, %[s1]\n"
+      "  add $16, %[s2]\n"
+      "  add $16, %[s3]\n"
+      "  decl %[n]\n"
+      "  jnz 1b\n"
+      "  emms\n"
+      : [d1] "+r" (d1), [s1] "+r" (s1), [s2] "+r" (s2),
+        [s3] "+r" (s3), [n] "+r" (n)
+      :
+      );
+}
+OIL_DEFINE_IMPL_FULL (add2_rshift_add_s16_mmx_unroll2, add2_rshift_add_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
+
+void
+add2_rshift_sub_s16_mmx_unroll2 (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3,
+    int16_t *s4_2, int n)
+{
+  while (n&7) {
+    int x;
+
+    x = s4_2[0] + s2[0] + s3[0];
+    x >>= s4_2[1];
+    d1[0] = s1[0] - x;
+
+    d1++;
+    s1++;
+    s2++;
+    s3++;
+    n--;
+  }
+  if (n==0) return;
+
+  n>>=3;
+  asm volatile ("\n"
+      "  movd 0(%[s4_2]), %%mm4\n"
+      "  pshufw $0x00, %%mm4, %%mm4\n"
+      "  movzwl 2(%[s4_2]), %%ecx\n"
+      "  movd %%ecx, %%mm5\n"
+      :: [s4_2] "r" (s4_2)
+      : "ecx");
+  asm volatile ("\n"
+      "1:\n"
+      "  movq %%mm4, %%mm0\n"
+      "  paddw 0(%[s2]), %%mm0\n"
+      "  paddw 0(%[s3]), %%mm0\n"
+      "  psraw %%mm5, %%mm0\n"
+      "  movq 0(%[s1]), %%mm1\n"
+      "  psubw %%mm0, %%mm1\n"
+      "  movq %%mm1, 0(%[d1])\n"
+
+      "  movq %%mm4, %%mm0\n"
+      "  paddw 8(%[s2]), %%mm0\n"
+      "  paddw 8(%[s3]), %%mm0\n"
+      "  psraw %%mm5, %%mm0\n"
+      "  movq 8(%[s1]), %%mm1\n"
+      "  psubw %%mm0, %%mm1\n"
+      "  movq %%mm1, 8(%[d1])\n"
+
+      "  add $16, %[d1]\n"
+      "  add $16, %[s1]\n"
+      "  add $16, %[s2]\n"
+      "  add $16, %[s3]\n"
+      "  decl %[n]\n"
+      "  jnz 1b\n"
+      "  emms\n"
+      : [d1] "+r" (d1), [s1] "+r" (s1), [s2] "+r" (s2),
+        [s3] "+r" (s3), [n] "+r" (n)
+      :
+      );
+}
+OIL_DEFINE_IMPL_FULL (add2_rshift_sub_s16_mmx_unroll2, add2_rshift_sub_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
+
+void
+add2_rshift_add_s16_mmx_unroll4 (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3,
+    int16_t *s4_2, int n)
+{
+  while (n&15) {
+    int x;
+
+    x = s4_2[0] + s2[0] + s3[0];
+    x >>= s4_2[1];
+    d1[0] = s1[0] + x;
+
+    d1++;
+    s1++;
+    s2++;
+    s3++;
+    n--;
+  }
+  if (n==0) return;
+
+  n>>=4;
+  asm volatile ("\n"
+      "  movd 0(%[s4_2]), %%mm4\n"
+      "  pshufw $0x00, %%mm4, %%mm4\n"
+      "  movzwl 2(%[s4_2]), %%ecx\n"
+      "  movd %%ecx, %%mm5\n"
+      :: [s4_2] "r" (s4_2)
+      : "ecx");
+  asm volatile ("\n"
+      "1:\n"
+      "  movq %%mm4, %%mm0\n"
+      "  paddw 0(%[s2]), %%mm0\n"
+      "  paddw 0(%[s3]), %%mm0\n"
+      "  psraw %%mm5, %%mm0\n"
+      "  paddw 0(%[s1]), %%mm0\n"
+      "  movq %%mm0, 0(%[d1])\n"
+
+      "  movq %%mm4, %%mm0\n"
+      "  paddw 8(%[s2]), %%mm0\n"
+      "  paddw 8(%[s3]), %%mm0\n"
+      "  psraw %%mm5, %%mm0\n"
+      "  paddw 8(%[s1]), %%mm0\n"
+      "  movq %%mm0, 8(%[d1])\n"
+
+      "  movq %%mm4, %%mm0\n"
+      "  paddw 16(%[s2]), %%mm0\n"
+      "  paddw 16(%[s3]), %%mm0\n"
+      "  psraw %%mm5, %%mm0\n"
+      "  paddw 16(%[s1]), %%mm0\n"
+      "  movq %%mm0, 16(%[d1])\n"
+
+      "  movq %%mm4, %%mm0\n"
+      "  paddw 24(%[s2]), %%mm0\n"
+      "  paddw 24(%[s3]), %%mm0\n"
+      "  psraw %%mm5, %%mm0\n"
+      "  paddw 24(%[s1]), %%mm0\n"
+      "  movq %%mm0, 24(%[d1])\n"
+
+      "  add $32, %[d1]\n"
+      "  add $32, %[s1]\n"
+      "  add $32, %[s2]\n"
+      "  add $32, %[s3]\n"
+      "  decl %[n]\n"
+      "  jnz 1b\n"
+      "  emms\n"
+      : [d1] "+r" (d1), [s1] "+r" (s1), [s2] "+r" (s2),
+        [s3] "+r" (s3), [n] "+r" (n)
+      :
+      );
+}
+OIL_DEFINE_IMPL_FULL (add2_rshift_add_s16_mmx_unroll4, add2_rshift_add_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
+
+void
+add2_rshift_sub_s16_mmx_unroll4 (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3,
+    int16_t *s4_2, int n)
+{
+  while (n&15) {
+    int x;
+
+    x = s4_2[0] + s2[0] + s3[0];
+    x >>= s4_2[1];
+    d1[0] = s1[0] - x;
+
+    d1++;
+    s1++;
+    s2++;
+    s3++;
+    n--;
+  }
+  if (n==0) return;
+
+  n>>=4;
+  asm volatile ("\n"
+      "  movd 0(%[s4_2]), %%mm4\n"
+      "  pshufw $0x00, %%mm4, %%mm4\n"
+      "  movzwl 2(%[s4_2]), %%ecx\n"
+      "  movd %%ecx, %%mm5\n"
+      :: [s4_2] "r" (s4_2)
+      : "ecx");
+  asm volatile ("\n"
+      "1:\n"
+      "  movq %%mm4, %%mm0\n"
+      "  paddw 0(%[s2]), %%mm0\n"
+      "  paddw 0(%[s3]), %%mm0\n"
+      "  psraw %%mm5, %%mm0\n"
+      "  movq 0(%[s1]), %%mm1\n"
+      "  psubw %%mm0, %%mm1\n"
+      "  movq %%mm1, 0(%[d1])\n"
+
+      "  movq %%mm4, %%mm2\n"
+      "  paddw 8(%[s2]), %%mm2\n"
+      "  paddw 8(%[s3]), %%mm2\n"
+      "  psraw %%mm5, %%mm2\n"
+      "  movq 8(%[s1]), %%mm3\n"
+      "  psubw %%mm2, %%mm3\n"
+      "  movq %%mm3, 8(%[d1])\n"
+
+      "  movq %%mm4, %%mm0\n"
+      "  paddw 16(%[s2]), %%mm0\n"
+      "  paddw 16(%[s3]), %%mm0\n"
+      "  psraw %%mm5, %%mm0\n"
+      "  movq 16(%[s1]), %%mm1\n"
+      "  psubw %%mm0, %%mm1\n"
+      "  movq %%mm1, 16(%[d1])\n"
+
+      "  movq %%mm4, %%mm2\n"
+      "  paddw 24(%[s2]), %%mm2\n"
+      "  paddw 24(%[s3]), %%mm2\n"
+      "  psraw %%mm5, %%mm2\n"
+      "  movq 24(%[s1]), %%mm3\n"
+      "  psubw %%mm2, %%mm3\n"
+      "  movq %%mm3, 24(%[d1])\n"
+
+      "  add $32, %[d1]\n"
+      "  add $32, %[s1]\n"
+      "  add $32, %[s2]\n"
+      "  add $32, %[s3]\n"
+      "  decl %[n]\n"
+      "  jnz 1b\n"
+      "  emms\n"
+      : [d1] "+r" (d1), [s1] "+r" (s1), [s2] "+r" (s2),
+        [s3] "+r" (s3), [n] "+r" (n)
+      :
+      );
+}
+OIL_DEFINE_IMPL_FULL (add2_rshift_sub_s16_mmx_unroll4, add2_rshift_sub_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
+
+
+void
+add2_rshift_add_s16_sse (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3,
+    int16_t *s4_2, int n)
+{
+  while (n&7) {
+    int x;
+
+    x = s4_2[0] + s2[0] + s3[0];
+    x >>= s4_2[1];
+    d1[0] = s1[0] + x;
+
+    d1++;
+    s1++;
+    s2++;
+    s3++;
+    n--;
+  }
+  if (n==0) return;
+
+  n>>=3;
+  asm volatile ("\n"
+      "  movd 0(%[s4_2]), %%xmm4\n"
+      "  pshuflw $0x00, %%xmm4, %%xmm4\n"
+      "  pshufd $0x00, %%xmm4, %%xmm4\n"
+      "  movzwl 2(%[s4_2]), %%ecx\n"
+      "  movd %%ecx, %%xmm5\n"
+      :: [s4_2] "r" (s4_2)
+      : "ecx");
+  asm volatile ("\n"
+      "1:\n"
+#if 0
+      "  movdqu %%xmm4, %%xmm0\n"
+      "  movdqu 0(%[s2]), %%xmm1\n"
+      "  paddw %%xmm1, %%xmm0\n"
+      "  movdqu 0(%[s3]), %%xmm2\n"
+      "  paddw %%xmm2, %%xmm0\n"
+      "  psraw %%xmm5, %%xmm0\n"
+      "  movdqu 0(%[d1]), %%xmm1\n"
+      "  paddw %%xmm1, %%xmm0\n"
+      "  movdqu %%xmm0, 0(%[d1])\n"
+#endif
+      "  movdqu %%xmm4, %%xmm0\n"
+      "  movdqu 0(%[s2]), %%xmm1\n"
+      "  paddw %%xmm1, %%xmm0\n"
+      "  movdqu 0(%[s3]), %%xmm2\n"
+      "  paddw %%xmm2, %%xmm0\n"
+      "  psraw %%xmm5, %%xmm0\n"
+      "  movdqu 0(%[s1]), %%xmm1\n"
+      "  paddw %%xmm0, %%xmm1\n"
+      "  movdqu %%xmm1, 0(%[d1])\n"
+
+      "  add $16, %[d1]\n"
+      "  add $16, %[s1]\n"
+      "  add $16, %[s2]\n"
+      "  add $16, %[s3]\n"
+      "  decl %[n]\n"
+      "  jnz 1b\n"
+      : [d1] "+r" (d1), [s1] "+r" (s1), [s2] "+r" (s2),
+        [s3] "+r" (s3), [n] "+r" (n)
+      :
+      );
+}
+OIL_DEFINE_IMPL_FULL (add2_rshift_add_s16_sse, add2_rshift_add_s16, OIL_IMPL_FLAG_MMXEXT|OIL_IMPL_FLAG_SSE|OIL_IMPL_FLAG_SSE2);
+
+void
+add2_rshift_sub_s16_sse (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3,
+    int16_t *s4_2, int n)
+{
+  while (n&7) {
+    int x;
+
+    x = s4_2[0] + s2[0] + s3[0];
+    x >>= s4_2[1];
+    d1[0] = s1[0] - x;
+
+    d1++;
+    s1++;
+    s2++;
+    s3++;
+    n--;
+  }
+  if (n==0) return;
+
+  n>>=3;
+  asm volatile ("\n"
+      "  movd 0(%[s4_2]), %%xmm4\n"
+      "  pshuflw $0x00, %%xmm4, %%xmm4\n"
+      "  pshufd $0x00, %%xmm4, %%xmm4\n"
+      "  movzwl 2(%[s4_2]), %%ecx\n"
+      "  movd %%ecx, %%xmm5\n"
+      :: [s4_2] "r" (s4_2)
+      : "ecx");
+  asm volatile ("\n"
+      "1:\n"
+      "  movdqu %%xmm4, %%xmm0\n"
+      "  movdqu 0(%[s2]), %%xmm1\n"
+      "  paddw %%xmm1, %%xmm0\n"
+      "  movdqu 0(%[s3]), %%xmm2\n"
+      "  paddw %%xmm2, %%xmm0\n"
+      "  psraw %%xmm5, %%xmm0\n"
+      "  movdqu 0(%[s1]), %%xmm1\n"
+      "  psubw %%xmm0, %%xmm1\n"
+      "  movdqu %%xmm1, 0(%[d1])\n"
+
+      "  add $16, %[d1]\n"
+      "  add $16, %[s1]\n"
+      "  add $16, %[s2]\n"
+      "  add $16, %[s3]\n"
+      "  decl %[n]\n"
+      "  jnz 1b\n"
+      : [d1] "+r" (d1), [s1] "+r" (s1), [s2] "+r" (s2),
+        [s3] "+r" (s3), [n] "+r" (n)
+      :
+      );
+}
+OIL_DEFINE_IMPL_FULL (add2_rshift_sub_s16_sse, add2_rshift_sub_s16, OIL_IMPL_FLAG_MMXEXT|OIL_IMPL_FLAG_SSE|OIL_IMPL_FLAG_SSE2);
+
diff --git a/liboil/i386_amd64/mas.c b/liboil/i386_amd64/mas.c
new file mode 100644
index 0000000..20d4d2f
--- /dev/null
+++ b/liboil/i386_amd64/mas.c
@@ -0,0 +1,1114 @@
+
+#include <liboil/liboilfunction.h>
+#include <liboil/liboilclasses.h>
+
+
+void
+mas10_u8_mmx (uint8_t *d, const uint8_t *s1_np9, const int16_t *s2_10,
+        const int16_t *s3_2, int n)
+{
+  int j;
+  int x;
+
+  while(n&3) {
+    x = 0;
+    for(j=0;j<10;j++){
+      x += s1_np9[j] * s2_10[j];
+    }
+    *d = CLAMP((x + s3_2[0])>>s3_2[1],0,255);
+    d++;
+    s1_np9++;
+    n--;
+  }
+
+  if (n == 0) return;
+  n>>=2;
+  __asm__ __volatile__("\n"
+      "  pxor %%mm7, %%mm7\n"
+
+      "  movd (%[s3_2]), %%mm6\n"
+
+      "  movzwl 2(%[s3_2]), %%ecx\n"
+      "  movd %%ecx, %%mm5\n"
+
+      "1:\n"
+      /* load 128 */
+      "  pshufw $0x00, %%mm6, %%mm2\n"
+
+#define LOOP(x) \
+      "  movd " #x "(%[s1_np9]), %%mm0\n" \
+      "  punpcklbw %%mm7, %%mm0\n" \
+      "  movq 2*" #x "(%[s2_10]), %%mm1\n" \
+      "  pshufw $0x00, %%mm1, %%mm1\n" \
+      "  pmullw %%mm1, %%mm0\n" \
+      "  paddw %%mm0, %%mm2\n"
+
+      LOOP(0)
+      LOOP(1)
+      LOOP(2)
+      LOOP(3)
+      LOOP(4)
+      LOOP(5)
+      LOOP(6)
+      LOOP(7)
+      LOOP(8)
+      LOOP(9)
+#undef LOOP
+
+      "  psraw %%mm5, %%mm2\n"
+      "  pmaxsw %%mm7, %%mm2\n"
+      "  packuswb %%mm2, %%mm2\n"
+      "  movd %%mm2, 0(%[d])\n"
+      "  add $4, %[d]\n"
+      "  add $4, %[s1_np9]\n"
+      "  decl %[n]\n"
+      "  jnz 1b\n"
+      "  emms\n"
+      : [d] "+r" (d),
+        [s1_np9] "+r" (s1_np9),
+        [n] "+m" (n)
+      : [s2_10] "r" (s2_10),
+        [s3_2] "r" (s3_2)
+      : "ecx");
+}
+OIL_DEFINE_IMPL_FULL (mas10_u8_mmx, mas10_u8_l15, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
+
+void
+mas10_u8_mmx_2 (uint8_t *d, const uint8_t *s1_np9, const int16_t *s2_10,
+        const int16_t *s3_2, int n)
+{
+  int j;
+  int x;
+  int16_t coeff[4*10];
+  int16_t *ptr;
+
+  ptr = coeff;
+
+  while(n&3) {
+    x = 0;
+    for(j=0;j<10;j++){
+      x += s1_np9[j] * s2_10[j];
+    }
+    *d = CLAMP((x + s3_2[0])>>s3_2[1],0,255);
+    d++;
+    s1_np9++;
+    n--;
+  }
+
+  for(j=0;j<10;j++){
+    ptr[4*j + 0] = s2_10[j];
+    ptr[4*j + 1] = s2_10[j];
+    ptr[4*j + 2] = s2_10[j];
+    ptr[4*j + 3] = s2_10[j];
+  }
+
+  if (n == 0) return;
+  n>>=2;
+  __asm__ __volatile__("\n"
+      "  pxor %%mm7, %%mm7\n"
+
+      "  movd (%[s3_2]), %%mm6\n"
+
+      "  movzwl 2(%[s3_2]), %%ecx\n"
+      "  movd %%ecx, %%mm5\n"
+
+      "1:\n"
+      /* load 128 */
+      "  pshufw $0x00, %%mm6, %%mm2\n"
+
+#define LOOP(x) \
+      "  movd " #x "(%[s1_np9]), %%mm0\n" \
+      "  punpcklbw %%mm7, %%mm0\n" \
+      "  pmullw 8*" #x "(%[coeff]), %%mm0\n" \
+      "  paddw %%mm0, %%mm2\n"
+
+      LOOP(0)
+      LOOP(1)
+      LOOP(2)
+      LOOP(3)
+      LOOP(4)
+      LOOP(5)
+      LOOP(6)
+      LOOP(7)
+      LOOP(8)
+      LOOP(9)
+#undef LOOP
+
+      "  psraw %%mm5, %%mm2\n"
+      "  pmaxsw %%mm7, %%mm2\n"
+      "  packuswb %%mm2, %%mm2\n"
+      "  movd %%mm2, 0(%[d])\n"
+      "  add $4, %[d]\n"
+      "  add $4, %[s1_np9]\n"
+      "  decl %[n]\n"
+      "  jnz 1b\n"
+      "  emms\n"
+      : [d] "+r" (d),
+        [s1_np9] "+r" (s1_np9),
+        [n] "+m" (n)
+      : [coeff] "r" (ptr),
+        [s3_2] "r" (s3_2)
+      : "ecx");
+}
+OIL_DEFINE_IMPL_FULL (mas10_u8_mmx_2, mas10_u8_l15, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
+
+void
+mas10_u8_mmx_3 (uint8_t *d, const uint8_t *s1_np9, const int16_t *s2_10,
+        const int16_t *s3_2, int n)
+{
+  int j;
+  int x;
+
+  while(n&3) {
+    x = 0;
+    for(j=0;j<10;j++){
+      x += s1_np9[j] * s2_10[j];
+    }
+    *d = CLAMP((x + s3_2[0])>>s3_2[1],0,255);
+    d++;
+    s1_np9++;
+    n--;
+  }
+
+  if (n == 0) return;
+  n>>=2;
+  __asm__ __volatile__("\n"
+      "  pxor %%mm7, %%mm7\n"
+
+      "  movd (%[s3_2]), %%mm6\n"
+
+      "  movzwl 2(%[s3_2]), %%ecx\n"
+      "  movd %%ecx, %%mm5\n"
+
+      "  movq 0(%[s2_10]), %%mm3\n"
+      "  movq 8(%[s2_10]), %%mm4\n"
+
+      "1:\n"
+      /* load 128 */
+      "  pshufw $0x00, %%mm6, %%mm2\n"
+
+#define LOOP(x) \
+      "  movd " #x "(%[s1_np9]), %%mm0\n" \
+      "  punpcklbw %%mm7, %%mm0\n" \
+      "  movq 2*" #x "(%[s2_10]), %%mm1\n" \
+      "  pshufw $0x00, %%mm1, %%mm1\n" \
+      "  pmullw %%mm1, %%mm0\n" \
+      "  paddw %%mm0, %%mm2\n"
+
+      //LOOP(0)
+      "  movd 0(%[s1_np9]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  pshufw $0x00, %%mm3, %%mm1\n"
+      "  pmullw %%mm1, %%mm0\n"
+      "  paddw %%mm0, %%mm2\n"
+
+      //LOOP(1)
+      "  movd 1(%[s1_np9]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  pshufw $0x55*1, %%mm3, %%mm1\n"
+      "  pmullw %%mm1, %%mm0\n"
+      "  paddw %%mm0, %%mm2\n"
+
+      //LOOP(2)
+      "  movd 2(%[s1_np9]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  pshufw $0x55*2, %%mm3, %%mm1\n"
+      "  pmullw %%mm1, %%mm0\n"
+      "  paddw %%mm0, %%mm2\n"
+
+      //LOOP(3)
+      "  movd 3(%[s1_np9]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  pshufw $0x55*3, %%mm3, %%mm1\n"
+      "  pmullw %%mm1, %%mm0\n"
+      "  paddw %%mm0, %%mm2\n"
+
+      //LOOP(4)
+      "  movd 4(%[s1_np9]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  pshufw $0x00, %%mm4, %%mm1\n"
+      "  pmullw %%mm1, %%mm0\n"
+      "  paddw %%mm0, %%mm2\n"
+
+      //LOOP(5)
+      "  movd 5(%[s1_np9]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  pshufw $0x55*1, %%mm4, %%mm1\n"
+      "  pmullw %%mm1, %%mm0\n"
+      "  paddw %%mm0, %%mm2\n"
+
+      //LOOP(6)
+      "  movd 6(%[s1_np9]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  pshufw $0x55*2, %%mm4, %%mm1\n"
+      "  pmullw %%mm1, %%mm0\n"
+      "  paddw %%mm0, %%mm2\n"
+
+      //LOOP(7)
+      "  movd 7(%[s1_np9]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  pshufw $0x55*3, %%mm4, %%mm1\n"
+      "  pmullw %%mm1, %%mm0\n"
+      "  paddw %%mm0, %%mm2\n"
+
+      LOOP(8)
+      LOOP(9)
+#undef LOOP
+
+      "  psraw %%mm5, %%mm2\n"
+      "  pmaxsw %%mm7, %%mm2\n"
+      "  packuswb %%mm2, %%mm2\n"
+      "  movd %%mm2, 0(%[d])\n"
+      "  add $4, %[d]\n"
+      "  add $4, %[s1_np9]\n"
+      "  decl %[n]\n"
+      "  jnz 1b\n"
+      "  emms\n"
+      : [d] "+r" (d),
+        [s1_np9] "+r" (s1_np9),
+        [n] "+m" (n)
+      : [s2_10] "r" (s2_10),
+        [s3_2] "r" (s3_2)
+      : "ecx");
+}
+OIL_DEFINE_IMPL_FULL (mas10_u8_mmx_3, mas10_u8_l15, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
+
+void
+mas10_u8_mmx_4 (uint8_t *d, const uint8_t *s1_np9, const int16_t *s2_10,
+        const int16_t *s3_2, int n)
+{
+  if (n == 0) return;
+  __asm__ __volatile__("\n"
+      "  pxor %%mm7, %%mm7\n"
+
+      "  movzwl 0(%[s3_2]), %%ecx\n"
+      "  movd %%ecx, %%mm6\n"
+
+      "  movzwl 2(%[s3_2]), %%ecx\n"
+      "  movd %%ecx, %%mm5\n"
+
+      "1:\n"
+      "  movd 0(%[s1_np9]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  pmaddwd 0(%[s2_10]), %%mm0\n"
+
+      "  movd 4(%[s1_np9]), %%mm1\n"
+      "  punpcklbw %%mm7, %%mm1\n"
+      "  pmaddwd 8(%[s2_10]), %%mm1\n"
+
+      "  movd 8(%[s1_np9]), %%mm2\n"
+      "  punpcklbw %%mm7, %%mm2\n"
+      "  pmaddwd 16(%[s2_10]), %%mm2\n"
+
+      "  paddd %%mm1, %%mm0\n"
+      "  movq %%mm0, %%mm1\n"
+      "  psrlq $32, %%mm0\n"
+      "  paddd %%mm1, %%mm0\n"
+      "  paddd %%mm2, %%mm0\n"
+      "  paddd %%mm6, %%mm0\n"
+
+      "  psrad %%mm5, %%mm0\n"
+      "  pmaxsw %%mm7, %%mm0\n"
+      "  packuswb %%mm0, %%mm0\n"
+      "  movd %%mm0, %%ecx\n"
+      "  movb %%cl,0(%[d])\n"
+
+      "  add $1, %[d]\n"
+      "  add $1, %[s1_np9]\n"
+      "  decl %[n]\n"
+      "  jnz 1b\n"
+      "  emms\n"
+      : [d] "+r" (d),
+        [s1_np9] "+r" (s1_np9),
+        [n] "+m" (n)
+      : [s2_10] "r" (s2_10),
+        [s3_2] "r" (s3_2)
+      : "ecx");
+}
+OIL_DEFINE_IMPL_FULL (mas10_u8_mmx_4, mas10_u8, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
+
+
+void
+mas8_u8_mmx_3 (uint8_t *d, const uint8_t *s1_np7, const int16_t *s2_8,
+        const int16_t *s3_2, int n)
+{
+  int j;
+  int x;
+
+  while(n&3) {
+    x = 0;
+    for(j=0;j<8;j++){
+      x += s1_np7[j] * s2_8[j];
+    }
+    *d = CLAMP((x + s3_2[0])>>s3_2[1],0,255);
+    d++;
+    s1_np7++;
+    n--;
+  }
+
+  if (n == 0) return;
+  n>>=2;
+  __asm__ __volatile__("\n"
+      "  pxor %%mm7, %%mm7\n"
+
+      "  movd (%[s3_2]), %%mm6\n"
+
+      "  movzwl 2(%[s3_2]), %%ecx\n"
+      "  movd %%ecx, %%mm5\n"
+
+      "  movq 0(%[s2_8]), %%mm3\n"
+      "  movq 8(%[s2_8]), %%mm4\n"
+
+      "1:\n"
+      /* load 128 */
+      "  pshufw $0x00, %%mm6, %%mm2\n"
+
+      "  movd 0(%[s1_np7]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  pshufw $0x00, %%mm3, %%mm1\n"
+      "  pmullw %%mm1, %%mm0\n"
+      "  paddw %%mm0, %%mm2\n"
+
+      "  movd 1(%[s1_np7]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  pshufw $0x55*1, %%mm3, %%mm1\n"
+      "  pmullw %%mm1, %%mm0\n"
+      "  paddw %%mm0, %%mm2\n"
+
+      "  movd 2(%[s1_np7]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  pshufw $0x55*2, %%mm3, %%mm1\n"
+      "  pmullw %%mm1, %%mm0\n"
+      "  paddw %%mm0, %%mm2\n"
+
+      "  movd 3(%[s1_np7]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  pshufw $0x55*3, %%mm3, %%mm1\n"
+      "  pmullw %%mm1, %%mm0\n"
+      "  paddw %%mm0, %%mm2\n"
+
+      "  movd 4(%[s1_np7]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  pshufw $0x00, %%mm4, %%mm1\n"
+      "  pmullw %%mm1, %%mm0\n"
+      "  paddw %%mm0, %%mm2\n"
+
+      "  movd 5(%[s1_np7]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  pshufw $0x55*1, %%mm4, %%mm1\n"
+      "  pmullw %%mm1, %%mm0\n"
+      "  paddw %%mm0, %%mm2\n"
+
+      "  movd 6(%[s1_np7]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  pshufw $0x55*2, %%mm4, %%mm1\n"
+      "  pmullw %%mm1, %%mm0\n"
+      "  paddw %%mm0, %%mm2\n"
+
+      "  movd 7(%[s1_np7]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  pshufw $0x55*3, %%mm4, %%mm1\n"
+      "  pmullw %%mm1, %%mm0\n"
+      "  paddw %%mm0, %%mm2\n"
+
+      "  psraw %%mm5, %%mm2\n"
+      "  pmaxsw %%mm7, %%mm2\n"
+      "  packuswb %%mm2, %%mm2\n"
+      "  movd %%mm2, 0(%[d])\n"
+      "  add $4, %[d]\n"
+      "  add $4, %[s1_np7]\n"
+      "  decl %[n]\n"
+      "  jnz 1b\n"
+      "  emms\n"
+      : [d] "+r" (d),
+        [s1_np7] "+r" (s1_np7),
+        [n] "+m" (n)
+      : [s2_8] "r" (s2_8),
+        [s3_2] "r" (s3_2)
+      : "ecx");
+}
+OIL_DEFINE_IMPL_FULL (mas8_u8_mmx_3, mas8_u8_l15, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
+
+void
+mas8_u8_mmx_4 (uint8_t *d, const uint8_t *s1_np7, const int16_t *s2_8,
+        const int16_t *s3_2, int n)
+{
+  if (n == 0) return;
+  __asm__ __volatile__("\n"
+      "  pxor %%mm7, %%mm7\n"
+
+      "  movzwl 0(%[s3_2]), %%ecx\n"
+      "  movd %%ecx, %%mm6\n"
+
+      "  movzwl 2(%[s3_2]), %%ecx\n"
+      "  movd %%ecx, %%mm5\n"
+
+      "1:\n"
+      "  movd 0(%[s1_np7]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  pmaddwd 0(%[s2_8]), %%mm0\n"
+
+      "  movd 4(%[s1_np7]), %%mm1\n"
+      "  punpcklbw %%mm7, %%mm1\n"
+      "  pmaddwd 8(%[s2_8]), %%mm1\n"
+
+      "  paddd %%mm1, %%mm0\n"
+      "  movq %%mm0, %%mm1\n"
+      "  psrlq $32, %%mm0\n"
+      "  paddd %%mm1, %%mm0\n"
+      "  paddd %%mm6, %%mm0\n"
+
+      "  psrad %%mm5, %%mm0\n"
+      "  pmaxsw %%mm7, %%mm0\n"
+      "  packuswb %%mm0, %%mm0\n"
+      "  movd %%mm0, %%ecx\n"
+      "  movb %%cl,0(%[d])\n"
+
+      "  add $1, %[d]\n"
+      "  add $1, %[s1_np7]\n"
+      "  decl %[n]\n"
+      "  jnz 1b\n"
+      "  emms\n"
+      : [d] "+r" (d),
+        [s1_np7] "+r" (s1_np7),
+        [n] "+m" (n)
+      : [s2_8] "r" (s2_8),
+        [s3_2] "r" (s3_2)
+      : "ecx");
+}
+OIL_DEFINE_IMPL_FULL (mas8_u8_mmx_4, mas8_u8, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
+
+void
+mas8_u8_sym_mmx_3 (uint8_t *d, const uint8_t *s1_np7, const int16_t *s2_8,
+        const int16_t *s3_2, int n)
+{
+  int j;
+  int x;
+
+  while(n&3) {
+    x = 0;
+    for(j=0;j<8;j++){
+      x += s1_np7[j] * s2_8[j];
+    }
+    *d = CLAMP((x + s3_2[0])>>s3_2[1],0,255);
+    d++;
+    s1_np7++;
+    n--;
+  }
+
+  if (n == 0) return;
+  n>>=2;
+  __asm__ __volatile__("\n"
+      "  pxor %%mm7, %%mm7\n"
+
+      "  movd (%[s3_2]), %%mm6\n"
+
+      "  movzwl 2(%[s3_2]), %%ecx\n"
+      "  movd %%ecx, %%mm5\n"
+
+      "  movq 0(%[s2_8]), %%mm3\n"
+      "  movq 8(%[s2_8]), %%mm4\n"
+
+      " .p2align 4,,15                  \n"
+      "1:\n"
+      /* load 128 */
+      "  pshufw $0x00, %%mm6, %%mm2\n"
+
+      "  movd 0(%[s1_np7]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  movd 7(%[s1_np7]), %%mm1\n"
+      "  punpcklbw %%mm7, %%mm1\n"
+      "  paddw %%mm1, %%mm0\n"
+      //"  pshufw $0x00, %%mm3, %%mm1\n"
+      //"  pmullw %%mm1, %%mm0\n"
+      //"  paddw %%mm0, %%mm2\n"
+      "  psubw %%mm0, %%mm2\n"
+
+      "  movd 1(%[s1_np7]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  movd 6(%[s1_np7]), %%mm1\n"
+      "  punpcklbw %%mm7, %%mm1\n"
+      "  paddw %%mm1, %%mm0\n"
+      "  pshufw $0x55*1, %%mm3, %%mm1\n"
+      "  pmullw %%mm1, %%mm0\n"
+      "  paddw %%mm0, %%mm2\n"
+
+      "  movd 2(%[s1_np7]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  movd 5(%[s1_np7]), %%mm1\n"
+      "  punpcklbw %%mm7, %%mm1\n"
+      "  paddw %%mm1, %%mm0\n"
+      "  pshufw $0x55*2, %%mm3, %%mm1\n"
+      "  pmullw %%mm1, %%mm0\n"
+      "  paddw %%mm0, %%mm2\n"
+
+      "  movd 3(%[s1_np7]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  movd 4(%[s1_np7]), %%mm1\n"
+      "  punpcklbw %%mm7, %%mm1\n"
+      "  paddw %%mm1, %%mm0\n"
+      "  pshufw $0x55*3, %%mm3, %%mm1\n"
+      "  pmullw %%mm1, %%mm0\n"
+      "  paddw %%mm0, %%mm2\n"
+
+      "  psraw %%mm5, %%mm2\n"
+      "  pmaxsw %%mm7, %%mm2\n"
+      "  packuswb %%mm2, %%mm2\n"
+      "  movd %%mm2, 0(%[d])\n"
+      "  add $4, %[d]\n"
+      "  add $4, %[s1_np7]\n"
+      "  decl %[n]\n"
+      "  jnz 1b\n"
+      "  emms\n"
+      : [d] "+r" (d),
+        [s1_np7] "+r" (s1_np7),
+        [n] "+m" (n)
+      : [s2_8] "r" (s2_8),
+        [s3_2] "r" (s3_2)
+      : "ecx");
+}
+OIL_DEFINE_IMPL_FULL (mas8_u8_sym_mmx_3, mas8_u8_sym_l15, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
+
+void
+mas8_u8_sym_mmx_41 (uint8_t *d, const uint8_t *s1_np7, const int16_t *s2_8,
+        const int16_t *s3_2, int n)
+{
+  int j;
+  int x;
+  int16_t tmp[16];
+
+  while(n&3) {
+    x = 0;
+    for(j=0;j<8;j++){
+      x += s1_np7[j] * s2_8[j];
+    }
+    *d = CLAMP((x + s3_2[0])>>s3_2[1],0,255);
+    d++;
+    s1_np7++;
+    n--;
+  }
+
+  if (n == 0) return;
+  n>>=2;
+  __asm__ __volatile__("\n"
+      "  pxor %%mm7, %%mm7\n"
+
+      "  movd (%[s3_2]), %%mm6\n"
+
+      "  movzwl 2(%[s3_2]), %%ecx\n"
+      "  movd %%ecx, %%mm5\n"
+
+      "  movq 0(%[s2_8]), %%mm3\n"
+      "  pshufw $0x55*0, %%mm3, %%mm1\n"
+      "  movq %%mm1, 0(%[coeff])\n"
+      "  pshufw $0x55*1, %%mm3, %%mm1\n"
+      "  movq %%mm1, 8(%[coeff])\n"
+      "  pshufw $0x55*2, %%mm3, %%mm1\n"
+      "  movq %%mm1, 16(%[coeff])\n"
+      "  pshufw $0x55*3, %%mm3, %%mm1\n"
+      "  movq %%mm1, 24(%[coeff])\n"
+      :
+      : [s2_8] "r" (s2_8),
+        [s3_2] "r" (s3_2),
+        [coeff] "r" (tmp)
+      : "ecx");
+
+  __asm__ __volatile__("\n"
+      " .p2align 4,,15                  \n"
+      "1:\n"
+      /* load 128 */
+      "  pshufw $0x00, %%mm6, %%mm2\n"
+
+      "  movd 0(%[s1_np7]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  movd 7(%[s1_np7]), %%mm1\n"
+      "  punpcklbw %%mm7, %%mm1\n"
+      "  paddw %%mm1, %%mm0\n"
+      "  pmullw 0(%[coeff]), %%mm0\n"
+      "  paddw %%mm0, %%mm2\n"
+
+      "  movd 1(%[s1_np7]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  movd 6(%[s1_np7]), %%mm1\n"
+      "  punpcklbw %%mm7, %%mm1\n"
+      "  paddw %%mm1, %%mm0\n"
+      "  pmullw 8(%[coeff]), %%mm0\n"
+      "  paddw %%mm0, %%mm2\n"
+
+      "  movd 2(%[s1_np7]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  movd 5(%[s1_np7]), %%mm1\n"
+      "  punpcklbw %%mm7, %%mm1\n"
+      "  paddw %%mm1, %%mm0\n"
+      "  pmullw 16(%[coeff]), %%mm0\n"
+      "  paddw %%mm0, %%mm2\n"
+
+      "  movd 3(%[s1_np7]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  movd 4(%[s1_np7]), %%mm1\n"
+      "  punpcklbw %%mm7, %%mm1\n"
+      "  paddw %%mm1, %%mm0\n"
+      "  pmullw 24(%[coeff]), %%mm0\n"
+      "  paddw %%mm0, %%mm2\n"
+
+      "  psraw %%mm5, %%mm2\n"
+      "  pmaxsw %%mm7, %%mm2\n"
+      "  packuswb %%mm2, %%mm2\n"
+      "  movd %%mm2, 0(%[d])\n"
+      "  add $4, %[d]\n"
+      "  add $4, %[s1_np7]\n"
+      "  decl %[n]\n"
+      "  jnz 1b\n"
+      "  emms\n"
+      : [d] "+r" (d),
+        [s1_np7] "+r" (s1_np7),
+        [n] "+m" (n)
+      : [s2_8] "r" (s2_8),
+        [coeff] "r" (tmp)
+      : "ecx");
+}
+OIL_DEFINE_IMPL_FULL (mas8_u8_sym_mmx_41, mas8_u8_sym_l15, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
+
+
+#define PSHUFW_3210 "0xe4"
+#define PSHUFW_0123 "0x1b"
+
+void
+mas8_u8_sym_mmx_5 (uint8_t *d, const uint8_t *s1_np7, const int16_t *s2_8,
+        const int16_t *s3_2, int n)
+{
+  if (n==0) return;
+  __asm__ __volatile__("\n"
+      "  pxor %%mm7, %%mm7\n"
+
+      "  movzwl 0(%[s3_2]), %%ecx\n"
+      "  movd %%ecx, %%mm6\n"
+      "  pshufw $0x44, %%mm6, %%mm6\n" // 01 00 01 00
+
+      "  movzwl 2(%[s3_2]), %%ecx\n"
+      "  movd %%ecx, %%mm5\n"
+
+      "  cmpl $0, %[n]\n"
+      "  jz 2f\n"
+
+      "1:\n"
+      "  movd 0(%[s1_np7]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+#if 1
+      "  movd 4(%[s1_np7]), %%mm1\n"
+      "  punpcklbw %%mm7, %%mm1\n"
+      "  pshufw $0x1b, %%mm1, %%mm1\n" // 00 01 10 11
+      "  paddw %%mm1, %%mm0\n"
+      "  pmaddwd 0(%[s2_8]), %%mm0\n"
+#else
+      "  pmaddwd 0(%[s2_8]), %%mm0\n"
+
+      "  movd 4(%[s1_np7]), %%mm1\n"
+      "  punpcklbw %%mm7, %%mm1\n"
+      "  pmaddwd 8(%[s2_8]), %%mm1\n"
+      "  paddd %%mm1, %%mm0\n"
+#endif
+      
+      "  pshufw $0xee, %%mm0, %%mm1\n" // 11 10 11 10
+      "  paddd %%mm1, %%mm0\n"
+      "  paddd %%mm6, %%mm0\n"
+
+      "  psrad %%mm5, %%mm0\n"
+      "  pmaxsw %%mm7, %%mm0\n"
+      "  packuswb %%mm0, %%mm0\n"
+      "  movd %%mm0, %%ecx\n"
+      "  movb %%cl,0(%[d])\n"
+
+      "  add $1, %[d]\n"
+      "  add $1, %[s1_np7]\n"
+      "  decl %[n]\n"
+      "  jnz 1b\n"
+
+      "2:\n"
+      "  emms\n"
+      : [d] "+r" (d),
+        [s1_np7] "+r" (s1_np7),
+        [n] "+m" (n)
+      : [s2_8] "r" (s2_8),
+        [s3_2] "r" (s3_2)
+      : "ecx");
+}
+OIL_DEFINE_IMPL_FULL (mas8_u8_sym_mmx_5, mas8_u8_sym_l15, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
+
+void
+mas8_u8_sym_mmx_6 (uint8_t *d, const uint8_t *s1_np7, const int16_t *s2_8,
+        const int16_t *s3_2, int n)
+{
+  int8_t coeff[8];
+  int8_t *ack;
+  int i;
+
+  for(i=0;i<8;i++){
+    //coeff[i] = s2_8[i];
+    coeff[i] = i;
+  }
+  ack = coeff;
+
+  if (n==0) return;
+  __asm__ __volatile__("\n"
+      "  pxor %%mm7, %%mm7\n"
+
+      "  movzwl 0(%[s3_2]), %%ecx\n"
+      "  movd %%ecx, %%mm6\n"
+      "  pshufw $0x44, %%mm6, %%mm6\n" // 01 00 01 00
+
+      "  movzwl 2(%[s3_2]), %%ecx\n"
+      "  movd %%ecx, %%mm5\n"
+
+      "  movq 0(%[s2_8]), %%mm4\n"
+      "  packsswb 8(%[s2_8]), %%mm4\n"
+
+      "1:\n"
+      "  movq 0(%[s1_np7]), %%mm0\n"
+      "  pmaddubsw %%mm4, %%mm0\n"
+
+#if 1
+      "  pshufw $0xee, %%mm0, %%mm1\n" // 11 10 11 10
+      "  paddw %%mm1, %%mm0\n"
+      "  pshufw $0x55, %%mm0, %%mm1\n" // 01 01 01 01
+      "  paddw %%mm1, %%mm0\n"
+#else
+      "  phaddw %%mm0, %%mm0\n"
+      "  phaddw %%mm0, %%mm0\n"
+#endif
+
+      "  paddw %%mm6, %%mm0\n"
+      "  psraw %%mm5, %%mm0\n"
+      "  pmaxsw %%mm7, %%mm0\n"
+      "  packuswb %%mm0, %%mm0\n"
+      "  movd %%mm0, %%ecx\n"
+      "  movb %%cl,0(%[d])\n"
+
+      "  add $1, %[d]\n"
+      "  add $1, %[s1_np7]\n"
+      "  decl %[n]\n"
+      "  jnz 1b\n"
+
+      "  emms\n"
+      : [d] "+r" (d),
+        [s1_np7] "+r" (s1_np7),
+        [n] "+m" (n)
+      : [s2_8] "r" (s2_8),
+        [s3_2] "r" (s3_2)
+      : "ecx");
+}
+OIL_DEFINE_IMPL_FULL (mas8_u8_sym_mmx_6, mas8_u8_sym_l15, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT|OIL_IMPL_FLAG_SSSE3);
+
+#ifdef ENABLE_BROKEN_IMPLS
+/* This only works for the taps array: -1, 3, -7, 21, 21, -7, 3, -1 */
+void
+mas8_u8_supersym_mmx (uint8_t *d, const uint8_t *s1_np7, const int16_t *s2_8,
+        const int16_t *s3_2, int n)
+{
+  int j;
+  int x;
+
+  while(n&3) {
+    x = 0;
+    for(j=0;j<8;j++){
+      x += s1_np7[j] * s2_8[j];
+    }
+    *d = CLAMP((x + s3_2[0])>>s3_2[1],0,255);
+    d++;
+    s1_np7++;
+    n--;
+  }
+
+  n>>=2;
+  __asm__ __volatile__("\n"
+      "  pxor %%mm7, %%mm7\n"
+
+      "  movd (%[s3_2]), %%mm6\n"
+      "  pshufw $0x00, %%mm6, %%mm6\n"
+
+      "  movzwl 2(%[s3_2]), %%ecx\n"
+      "  movd %%ecx, %%mm5\n"
+
+      "  movq 0(%[s2_8]), %%mm3\n"
+      "  movq 8(%[s2_8]), %%mm4\n"
+
+      "1:\n"
+      "  movd 0(%[s1_np7]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  movd 7(%[s1_np7]), %%mm1\n"
+      "  punpcklbw %%mm7, %%mm1\n"
+      "  paddw %%mm1, %%mm0\n"
+
+      "  movd 1(%[s1_np7]), %%mm2\n"
+      "  punpcklbw %%mm7, %%mm2\n"
+      "  movd 6(%[s1_np7]), %%mm3\n"
+      "  punpcklbw %%mm7, %%mm3\n"
+      "  paddw %%mm3, %%mm2\n"
+
+      "  paddw %%mm2, %%mm0\n"
+      "  psllw $2, %%mm2\n"
+      "  psubw %%mm0, %%mm2\n"
+      "      movq %%mm2, %%mm4\n"
+
+      "  movd 2(%[s1_np7]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  movd 5(%[s1_np7]), %%mm1\n"
+      "  punpcklbw %%mm7, %%mm1\n"
+      "  paddw %%mm1, %%mm0\n"
+
+      "  movd 3(%[s1_np7]), %%mm2\n"
+      "  punpcklbw %%mm7, %%mm2\n"
+      "  movd 4(%[s1_np7]), %%mm3\n"
+      "  punpcklbw %%mm7, %%mm3\n"
+      "  paddw %%mm3, %%mm2\n"
+
+      "  paddw %%mm2, %%mm0\n"
+      "  psllw $2, %%mm2\n"
+      "  psubw %%mm0, %%mm2\n"
+
+      "  psubw %%mm2, %%mm4\n"
+      "  psllw $3, %%mm2\n"
+      "  paddw %%mm4, %%mm2\n"
+
+      "  paddw %%mm6, %%mm2\n"
+
+      "  psraw %%mm5, %%mm2\n"
+      "  pmaxsw %%mm7, %%mm2\n"
+      "  packuswb %%mm2, %%mm2\n"
+      "  movd %%mm2, 0(%[d])\n"
+      "  add $4, %[d]\n"
+      "  add $4, %[s1_np7]\n"
+      "  decl %[n]\n"
+      "  jnz 1b\n"
+      "  emms\n"
+      : [d] "+r" (d),
+        [s1_np7] "+r" (s1_np7),
+        [n] "+m" (n)
+      : [s2_8] "r" (s2_8),
+        [s3_2] "r" (s3_2)
+      : "ecx");
+}
+OIL_DEFINE_IMPL_FULL (mas8_u8_supersym_mmx, mas8_u8_sym_l15, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
+#endif
+
+void
+mas12_addc_rshift_decim2_u8_mmx_4 (uint8_t *d, const uint8_t *s1_2xnp11,
+    const int16_t *s2_12, const int16_t *s3_2, int n)
+{
+  if (n == 0) return;
+  __asm__ __volatile__("\n"
+      "  pxor %%mm7, %%mm7\n"
+
+      "  movzwl 0(%[s3_2]), %%ecx\n"
+      "  movd %%ecx, %%mm6\n"
+
+      "  movzwl 2(%[s3_2]), %%ecx\n"
+      "  movd %%ecx, %%mm5\n"
+
+      "1:\n"
+      "  movd 0(%[s1_2xnp11]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  pmaddwd 0(%[s2_12]), %%mm0\n"
+
+      "  movd 4(%[s1_2xnp11]), %%mm1\n"
+      "  punpcklbw %%mm7, %%mm1\n"
+      "  pmaddwd 8(%[s2_12]), %%mm1\n"
+      "  paddd %%mm1, %%mm0\n"
+
+      "  movd 8(%[s1_2xnp11]), %%mm1\n"
+      "  punpcklbw %%mm7, %%mm1\n"
+      "  pmaddwd 16(%[s2_12]), %%mm1\n"
+      "  paddd %%mm1, %%mm0\n"
+
+      "  movq %%mm0, %%mm1\n"
+      "  psrlq $32, %%mm0\n"
+      "  paddd %%mm1, %%mm0\n"
+      "  paddd %%mm6, %%mm0\n"
+
+      "  psrad %%mm5, %%mm0\n"
+      "  pmaxsw %%mm7, %%mm0\n"
+      "  packuswb %%mm0, %%mm0\n"
+      "  movd %%mm0, %%ecx\n"
+      "  movb %%cl,0(%[d])\n"
+
+      "  add $1, %[d]\n"
+      "  add $2, %[s1_2xnp11]\n"
+      "  decl %[n]\n"
+      "  jnz 1b\n"
+      "  emms\n"
+      : [d] "+r" (d),
+        [s1_2xnp11] "+r" (s1_2xnp11),
+        [n] "+m" (n)
+      : [s2_12] "r" (s2_12),
+        [s3_2] "r" (s3_2)
+      : "ecx");
+}
+OIL_DEFINE_IMPL_FULL (mas12_addc_rshift_decim2_u8_mmx_4,
+    mas12_addc_rshift_decim2_u8, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
+
+#if 0
+void
+mas8_addc_rshift_decim2_u8_mmx_4 (uint8_t *d, const uint8_t *s1_2xnp9,
+    const int16_t *s2_8, const int16_t *s3_2, int n)
+{
+  if (n == 0) return;
+  __asm__ __volatile__("\n"
+      "  pxor %%mm7, %%mm7\n"
+
+      "  movzwl 0(%[s3_2]), %%ecx\n"
+      "  movd %%ecx, %%mm6\n"
+
+      "  movzwl 2(%[s3_2]), %%ecx\n"
+      "  movd %%ecx, %%mm5\n"
+
+      "1:\n"
+      "  movd 0(%[s1_2xnp9]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  pmaddwd 0(%[s2_8]), %%mm0\n"
+
+      "  movd 4(%[s1_2xnp9]), %%mm1\n"
+      "  punpcklbw %%mm7, %%mm1\n"
+      "  pmaddwd 8(%[s2_8]), %%mm1\n"
+      "  paddd %%mm1, %%mm0\n"
+
+      "  movq %%mm0, %%mm1\n"
+      "  psrlq $32, %%mm0\n"
+      "  paddd %%mm1, %%mm0\n"
+      "  paddd %%mm6, %%mm0\n"
+
+      "  psrad %%mm5, %%mm0\n"
+      "  pmaxsw %%mm7, %%mm0\n"
+      "  packuswb %%mm0, %%mm0\n"
+      "  movd %%mm0, %%ecx\n"
+      "  movb %%cl,0(%[d])\n"
+
+      "  add $1, %[d]\n"
+      "  add $2, %[s1_2xnp9]\n"
+      "  decl %[n]\n"
+      "  jnz 1b\n"
+      "  emms\n"
+      : [d] "+r" (d),
+        [s1_2xnp9] "+r" (s1_2xnp9),
+        [n] "+m" (n)
+      : [s2_8] "r" (s2_8),
+        [s3_2] "r" (s3_2)
+      : "ecx");
+}
+OIL_DEFINE_IMPL_FULL (mas8_addc_rshift_decim2_u8_mmx_4,
+    mas8_addc_rshift_decim2_u8, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
+
+#endif
+
+void
+mas8_across_u8_mmx_3 (uint8_t *d, const uint8_t *s1_nx8, int ss1,
+    const int16_t *s2_8, const int16_t *s3_2, int n)
+{
+  int i;
+  int x;
+
+  while(n&3) {
+    x = 0;
+    for(i=0;i<8;i++){
+      x += OIL_GET(s1_nx8, i*ss1, uint8_t)*s2_8[i];
+    }
+    *d = CLAMP((x + s3_2[0])>>s3_2[1],0,255);
+    d++;
+    s1_nx8++;
+    n--;
+  }
+
+  if (n == 0) return;
+  n>>=2;
+  __asm__ __volatile__("\n"
+      "  pxor %%mm7, %%mm7\n"
+
+      "  movd (%[s3_2]), %%mm6\n"
+
+      "  movzwl 2(%[s3_2]), %%ecx\n"
+      "  movd %%ecx, %%mm5\n"
+
+      "  movq 0(%[s2_8]), %%mm3\n"
+      "  movq 8(%[s2_8]), %%mm4\n"
+      :
+      : [s2_8] "r" (s2_8),
+        [s3_2] "r" (s3_2)
+      : "ecx");
+
+  while (n > 0) {
+    const uint8_t *p = s1_nx8;
+  __asm__ __volatile__("\n"
+      "1:\n"
+      /* load 128 */
+      "  pshufw $0x00, %%mm6, %%mm2\n"
+
+      "  movd 0(%[p]), %%mm0\n"
+      "  add %[ss1], %[p]\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  pshufw $0x00, %%mm3, %%mm1\n"
+      "  pmullw %%mm1, %%mm0\n"
+      "  paddw %%mm0, %%mm2\n"
+
+      "  movd 0(%[p]), %%mm0\n"
+      "  add %[ss1], %[p]\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  pshufw $0x55*1, %%mm3, %%mm1\n"
+      "  pmullw %%mm1, %%mm0\n"
+      "  paddw %%mm0, %%mm2\n"
+
+      "  movd 0(%[p]), %%mm0\n"
+      "  add %[ss1], %[p]\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  pshufw $0x55*2, %%mm3, %%mm1\n"
+      "  pmullw %%mm1, %%mm0\n"
+      "  paddw %%mm0, %%mm2\n"
+
+      "  movd 0(%[p]), %%mm0\n"
+      "  add %[ss1], %[p]\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  pshufw $0x55*3, %%mm3, %%mm1\n"
+      "  pmullw %%mm1, %%mm0\n"
+      "  paddw %%mm0, %%mm2\n"
+
+      "  movd 0(%[p]), %%mm0\n"
+      "  add %[ss1], %[p]\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  pshufw $0x00, %%mm4, %%mm1\n"
+      "  pmullw %%mm1, %%mm0\n"
+      "  paddw %%mm0, %%mm2\n"
+
+      "  movd 0(%[p]), %%mm0\n"
+      "  add %[ss1], %[p]\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  pshufw $0x55*1, %%mm4, %%mm1\n"
+      "  pmullw %%mm1, %%mm0\n"
+      "  paddw %%mm0, %%mm2\n"
+
+      "  movd 0(%[p]), %%mm0\n"
+      "  add %[ss1], %[p]\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  pshufw $0x55*2, %%mm4, %%mm1\n"
+      "  pmullw %%mm1, %%mm0\n"
+      "  paddw %%mm0, %%mm2\n"
+
+      "  movd 0(%[p]), %%mm0\n"
+      "  add %[ss1], %[p]\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  pshufw $0x55*3, %%mm4, %%mm1\n"
+      "  pmullw %%mm1, %%mm0\n"
+      "  paddw %%mm0, %%mm2\n"
+
+      "  psraw %%mm5, %%mm2\n"
+      "  pmaxsw %%mm7, %%mm2\n"
+      "  packuswb %%mm2, %%mm2\n"
+      "  movd %%mm2, 0(%[d])\n"
+      : [p] "+r" (p)
+      : [d] "r" (d), [ss1] "r" ((long)ss1));
+    d+=4;
+    s1_nx8+=4;
+    n--;
+  }
+
+  asm volatile ("emms");
+}
+OIL_DEFINE_IMPL_FULL (mas8_across_u8_mmx_3, mas8_across_u8, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
+
diff --git a/liboil/i386_amd64/multiply_and_acc.c b/liboil/i386_amd64/multiply_and_acc.c
new file mode 100644
index 0000000..c7d6f7f
--- /dev/null
+++ b/liboil/i386_amd64/multiply_and_acc.c
@@ -0,0 +1,250 @@
+
+#include <liboil/liboilfunction.h>
+#include <liboil/liboilclasses.h>
+
+/* multiply_and_acc_6xn_s16_u8_mmx:
+ * 2-D multiply-accumulate over a 6-column, n-row region:
+ *   i1[j] += s1[j] * s2[j]  for j = 0..5 on each row,
+ * where the s2 bytes are zero-extended to 16 bits and the sums wrap
+ * at 16 bits (pmullw/paddw).  is1/ss1/ss2 are byte strides added to
+ * i1/s1/s2 after each row.
+ * The second half of a row loads 8 bytes at i1+8/s1+8 and 4 bytes at
+ * s2+4 even though only 12 (resp. 6) bytes of the row are valid --
+ * hence the FIXME below; the final movd writes back only the 4 valid
+ * bytes, so the overflow is read-only.
+ */
+void
+multiply_and_acc_6xn_s16_u8_mmx (int16_t *i1, int is1, int16_t *s1,
+    int ss1, uint8_t *s2, int ss2, int n)
+{
+  /* FIXME this reads outside the arrays.  Bad. */
+  if (n==0) return;
+  /* %0=i1, %1=s1, %2=s2, %3=n (memory), %4=is1, %5=ss1, %6=ss2 */
+  __asm__ __volatile__ ("\n"
+      "  pxor %%mm7, %%mm7\n"         /* mm7 = 0, for zero-extension */
+      "1:\n"
+      "  movd 0(%2), %%mm0\n"         /* load s2[0..3] (u8) */
+      "  punpcklbw %%mm7, %%mm0\n"    /* zero-extend to 4 x s16 */
+      "  pmullw 0(%1), %%mm0\n"       /* * s1[0..3] */
+      "  paddw 0(%0), %%mm0\n"        /* + i1[0..3] */
+      "  movq %%mm0, 0(%0)\n"         /* store i1[0..3] */
+      "   movd 4(%2), %%mm1\n"        /* load s2[4..7] (2 bytes past row) */
+      "   punpcklbw %%mm7, %%mm1\n"
+      "   pmullw 8(%1), %%mm1\n"      /* * s1[4..7] (reads past column 6) */
+      "   paddw 8(%0), %%mm1\n"       /* + i1[4..7] (reads past column 6) */
+      "   movd %%mm1, 8(%0)\n"        /* store only the valid i1[4..5] */
+
+      "  add %4, %0\n"                /* advance all three row pointers */
+      "  add %5, %1\n"
+      "  add %6, %2\n"
+      "  decl %3\n"
+      "  jnz 1b\n"
+      "  emms\n"                      /* leave the FPU/MMX state clean */
+      : "+r" (i1), "+r" (s1), "+r" (s2), "+m" (n)
+      : "r" ((long)is1), "r" ((long)ss1), "r" ((long)ss2)
+      );
+}
+OIL_DEFINE_IMPL_FULL (multiply_and_acc_6xn_s16_u8_mmx,
+    multiply_and_acc_6xn_s16_u8, OIL_IMPL_FLAG_MMX);
+
+/* multiply_and_acc_8xn_s16_u8_mmx:
+ * 2-D multiply-accumulate over an 8-column, n-row region:
+ *   i1[j] += s1[j] * s2[j]  for j = 0..7 on each row
+ * (s2 bytes zero-extended to 16 bits; sums wrap at 16 bits).
+ * is1/ss1/ss2 are byte strides added to i1/s1/s2 after each row.
+ * All loads and stores stay within the 8-column row.
+ */
+void
+multiply_and_acc_8xn_s16_u8_mmx (int16_t *i1, int is1, int16_t *s1,
+    int ss1, uint8_t *s2, int ss2, int n)
+{
+  if (n==0) return;
+  /* %0=i1, %1=s1, %2=s2, %3=n (memory), %4=is1, %5=ss1, %6=ss2 */
+  __asm__ __volatile__ ("\n"
+      "  pxor %%mm7, %%mm7\n"         /* mm7 = 0, for zero-extension */
+      "1:\n"
+      "  movd 0(%2), %%mm0\n"         /* columns 0..3 */
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  pmullw 0(%1), %%mm0\n"
+      "  paddw 0(%0), %%mm0\n"
+      "  movq %%mm0, 0(%0)\n"
+      "   movd 4(%2), %%mm1\n"        /* columns 4..7 */
+      "   punpcklbw %%mm7, %%mm1\n"
+      "   pmullw 8(%1), %%mm1\n"
+      "   paddw 8(%0), %%mm1\n"
+      "   movq %%mm1, 8(%0)\n"
+
+      "  add %4, %0\n"                /* advance all three row pointers */
+      "  add %5, %1\n"
+      "  add %6, %2\n"
+      "  decl %3\n"
+      "  jnz 1b\n"
+      "  emms\n"                      /* leave the FPU/MMX state clean */
+      : "+r" (i1), "+r" (s1), "+r" (s2), "+m" (n)
+      : "r" ((long)is1), "r" ((long)ss1), "r" ((long)ss2)
+      );
+}
+OIL_DEFINE_IMPL_FULL (multiply_and_acc_8xn_s16_u8_mmx,
+    multiply_and_acc_8xn_s16_u8, OIL_IMPL_FLAG_MMX);
+
+/* multiply_and_acc_16xn_s16_u8_mmx:
+ * 2-D multiply-accumulate over a 16-column, n-row region:
+ *   i1[j] += s1[j] * s2[j]  for j = 0..15 on each row
+ * (s2 bytes zero-extended to 16 bits; sums wrap at 16 bits).
+ * is1/ss1/ss2 are byte strides added to i1/s1/s2 after each row.
+ * Four independent 4-column groups per row; mm2 is reused for the
+ * last two groups since each group completes before the next starts.
+ */
+void
+multiply_and_acc_16xn_s16_u8_mmx (int16_t *i1, int is1, int16_t *s1,
+    int ss1, uint8_t *s2, int ss2, int n)
+{
+  if (n==0) return;
+  /* %0=i1, %1=s1, %2=s2, %3=n (memory), %4=is1, %5=ss1, %6=ss2 */
+  __asm__ __volatile__ ("\n"
+      "  pxor %%mm7, %%mm7\n"         /* mm7 = 0, for zero-extension */
+      "1:\n"
+      "  movd 0(%2), %%mm0\n"         /* columns 0..3 */
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  pmullw 0(%1), %%mm0\n"
+      "  paddw 0(%0), %%mm0\n"
+      "  movq %%mm0, 0(%0)\n"
+      "   movd 4(%2), %%mm1\n"        /* columns 4..7 */
+      "   punpcklbw %%mm7, %%mm1\n"
+      "   pmullw 8(%1), %%mm1\n"
+      "   paddw 8(%0), %%mm1\n"
+      "   movq %%mm1, 8(%0)\n"
+      "    movd 8(%2), %%mm2\n"       /* columns 8..11 */
+      "    punpcklbw %%mm7, %%mm2\n"
+      "    pmullw 16(%1), %%mm2\n"
+      "    paddw 16(%0), %%mm2\n"
+      "    movq %%mm2, 16(%0)\n"
+      "     movd 12(%2), %%mm2\n"     /* columns 12..15 */
+      "     punpcklbw %%mm7, %%mm2\n"
+      "     pmullw 24(%1), %%mm2\n"
+      "     paddw 24(%0), %%mm2\n"
+      "     movq %%mm2, 24(%0)\n"
+
+      "  add %4, %0\n"                /* advance all three row pointers */
+      "  add %5, %1\n"
+      "  add %6, %2\n"
+      "  decl %3\n"
+      "  jnz 1b\n"
+      "  emms\n"                      /* leave the FPU/MMX state clean */
+      : "+r" (i1), "+r" (s1), "+r" (s2), "+m" (n)
+      : "r" ((long)is1), "r" ((long)ss1), "r" ((long)ss2)
+      );
+}
+OIL_DEFINE_IMPL_FULL (multiply_and_acc_16xn_s16_u8_mmx,
+    multiply_and_acc_16xn_s16_u8, OIL_IMPL_FLAG_MMX);
+
+/* multiply_and_acc_24xn_s16_u8_mmx:
+ * 2-D multiply-accumulate over a 24-column, n-row region:
+ *   i1[j] += s1[j] * s2[j]  for j = 0..23 on each row
+ * (s2 bytes zero-extended to 16 bits; sums wrap at 16 bits).
+ * is1/ss1/ss2 are byte strides added to i1/s1/s2 after each row.
+ * Six independent 4-column groups per row; mm2 is reused for the
+ * later groups since each group completes before the next starts.
+ */
+void
+multiply_and_acc_24xn_s16_u8_mmx (int16_t *i1, int is1, int16_t *s1,
+    int ss1, uint8_t *s2, int ss2, int n)
+{
+  if (n==0) return;
+  /* %0=i1, %1=s1, %2=s2, %3=n (memory), %4=is1, %5=ss1, %6=ss2 */
+  __asm__ __volatile__ ("\n"
+      "  pxor %%mm7, %%mm7\n"         /* mm7 = 0, for zero-extension */
+      "1:\n"
+      "  movd 0(%2), %%mm0\n"         /* columns 0..3 */
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  pmullw 0(%1), %%mm0\n"
+      "  paddw 0(%0), %%mm0\n"
+      "  movq %%mm0, 0(%0)\n"
+      "   movd 4(%2), %%mm1\n"        /* columns 4..7 */
+      "   punpcklbw %%mm7, %%mm1\n"
+      "   pmullw 8(%1), %%mm1\n"
+      "   paddw 8(%0), %%mm1\n"
+      "   movq %%mm1, 8(%0)\n"
+      "    movd 8(%2), %%mm2\n"       /* columns 8..11 */
+      "    punpcklbw %%mm7, %%mm2\n"
+      "    pmullw 16(%1), %%mm2\n"
+      "    paddw 16(%0), %%mm2\n"
+      "    movq %%mm2, 16(%0)\n"
+      "     movd 12(%2), %%mm2\n"     /* columns 12..15 */
+      "     punpcklbw %%mm7, %%mm2\n"
+      "     pmullw 24(%1), %%mm2\n"
+      "     paddw 24(%0), %%mm2\n"
+      "     movq %%mm2, 24(%0)\n"
+      " movd 16(%2), %%mm2\n"         /* columns 16..19 */
+      " punpcklbw %%mm7, %%mm2\n"
+      " pmullw 32(%1), %%mm2\n"
+      " paddw 32(%0), %%mm2\n"
+      " movq %%mm2, 32(%0)\n"
+      "  movd 20(%2), %%mm2\n"        /* columns 20..23 */
+      "  punpcklbw %%mm7, %%mm2\n"
+      "  pmullw 40(%1), %%mm2\n"
+      "  paddw 40(%0), %%mm2\n"
+      "  movq %%mm2, 40(%0)\n"
+
+      "  add %4, %0\n"                /* advance all three row pointers */
+      "  add %5, %1\n"
+      "  add %6, %2\n"
+      "  decl %3\n"
+      "  jnz 1b\n"
+      "  emms\n"                      /* leave the FPU/MMX state clean */
+      : "+r" (i1), "+r" (s1), "+r" (s2), "+m" (n)
+      : "r" ((long)is1), "r" ((long)ss1), "r" ((long)ss2)
+      );
+}
+OIL_DEFINE_IMPL_FULL (multiply_and_acc_24xn_s16_u8_mmx,
+    multiply_and_acc_24xn_s16_u8, OIL_IMPL_FLAG_MMX);
+
+
+#if 0
+/* mas2_add_s16_sse: disabled (#if 0), kept for reference.
+ * Intended behavior (as spelled out by the scalar peel loop):
+ *   d1[i] = s1[i] + ((s2[i]*s3_2[0] + s2[i+1]*s3_2[1] + s4_2[0]) >> s4_2[1])
+ * The scalar loop peels elements until n is a multiple of 8, then the
+ * SSE2 path produces 8 results per iteration (pmullw/pmulhw pairs are
+ * widened to 32-bit products via punpcklwd/punpckhwd).
+ * NOTE(review): "paddw 0(%1), %%xmm0" uses an SSE memory operand,
+ * which requires s1 to be 16-byte aligned on every iteration -- nothing
+ * here guarantees that; possibly why the code is disabled.  The second
+ * asm block also lacks a "memory" clobber despite storing through %0.
+ * The vector loop reads s2[i+1..i+8], so it assumes s2 has at least
+ * n+1 readable elements -- confirm against the mas2_add_s16 class
+ * definition before re-enabling.
+ */
+void
+mas2_add_s16_sse (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3_2,
+    int16_t *s4_2, int n)
+{
+  int shift = s4_2[1];  /* fed to the vector loop as operand %4 */
+
+  /* Scalar peel: handle the leading n&7 elements one at a time. */
+  while (n&7) {
+    int x;
+
+    x = s4_2[0] + s2[0]*s3_2[0] + s2[1]*s3_2[1];
+    x >>= s4_2[1];
+    d1[0] = s1[0] + x;
+
+    d1++;
+    s1++;
+    s2++;
+    n--;
+  }
+  if (n==0) return;
+
+  n>>=3;  /* remaining iterations, 8 results each */
+  /* Broadcast the constants: xmm7 = s3_2[0] (a0) in all 8 words,
+   * xmm6 = s3_2[1] (a1) in all 8 words, xmm5 = s4_2[0] (offset) in
+   * all 4 dwords. */
+  asm volatile ("\n"
+      "  movzwl 0(%0), %%ecx\n"
+      "  movd %%ecx, %%xmm7\n"
+      "  pshuflw $0x00, %%xmm7, %%xmm7\n"
+      "  pshufd $0x00, %%xmm7, %%xmm7\n"
+      "  movzwl 2(%0), %%ecx\n"
+      "  movd %%ecx, %%xmm6\n"
+      "  pshuflw $0x00, %%xmm6, %%xmm6\n"
+      "  pshufd $0x00, %%xmm6, %%xmm6\n"
+      "  movzwl 0(%1), %%ecx\n"
+      "  movd %%ecx, %%xmm5\n"
+      "  pshuflw $0x44, %%xmm5, %%xmm5\n"
+      "  pshufd $0x00, %%xmm5, %%xmm5\n"
+      :: "r" (s3_2), "r" (s4_2)
+      : "ecx"
+      );
+  /* %0=d1, %1=s1, %2=s2, %3=n, %4=shift */
+  asm volatile ("\n"
+      "1:\n"
+      "  movdqu 0(%2), %%xmm0\n"       // mm0 = s0, s1, s2, s3
+      "  movdqu 0(%2), %%xmm1\n"       // mm1 = s0, s1, s2, s3
+      "  pmullw %%xmm7, %%xmm0\n"     // mm0 = lo(s0*a0), lo(s1*a0), ...
+      "  pmulhw %%xmm7, %%xmm1\n"     // mm1 = hi(s0*a0), hi(s1*a0), ...
+      "  movdqu %%xmm0, %%xmm2\n"       // mm2 = lo(s0*a0), lo(s1*a0), ...
+      "  punpcklwd %%xmm1, %%xmm0\n"  // mm0 = s0*a0, s1*a0
+      "  punpckhwd %%xmm1, %%xmm2\n"  // mm2 = s2*a0, s3*a0
+      "  movdqu %%xmm2, %%xmm1\n"       // mm1 = s2*a0, s3*a0
+
+      "  movdqu 2(%2), %%xmm2\n"
+      "  movdqu 2(%2), %%xmm3\n"
+      "  pmullw %%xmm6, %%xmm2\n"
+      "  pmulhw %%xmm6, %%xmm3\n"
+      "  movdqu %%xmm2, %%xmm4\n"
+      "  punpcklwd %%xmm3, %%xmm2\n"  // mm2 = s1*a1, s2*a1
+      "  punpckhwd %%xmm3, %%xmm4\n"  // mm4 = s3*a1, s4*a1
+      "  movdqu %%xmm4, %%xmm3\n"       // mm3 = s3*a1, s4*a1
+
+      "  paddd %%xmm3, %%xmm1\n"      // mm1 = s2*a0 + s3*a1, ...
+      "  paddd %%xmm2, %%xmm0\n"      // mm0 = s0*a0 + s1*a1, ...
+
+      "  paddd %%xmm5, %%xmm1\n"      // mm1 = s2*a0 + s3*a1 + offset, ...
+      "  paddd %%xmm5, %%xmm0\n"      // mm0 = s0*a0 + s1*a1 + offset, ...
+
+      "  movd %4, %%xmm4\n"
+      "  psrad %%xmm4, %%xmm1\n"      // mm1 = (s2*a0 + s3*a1 + offset)>>shift, ...
+      "  psrad %%xmm4, %%xmm0\n"      // mm0 = (s0*a0 + s1*a1 + offset)>>shift, ...
+
+      "  packssdw %%xmm1, %%xmm0\n"
+      "  paddw 0(%1), %%xmm0\n"       /* requires 16-byte-aligned s1 */
+      "  movdqu %%xmm0, 0(%0)\n"
+      "  add $16, %0\n"
+      "  add $16, %1\n"
+      "  add $16, %2\n"
+      "  decl %3\n"
+      "  jnz 1b\n"
+      : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n)
+      : "r" (shift)
+      );
+}
+OIL_DEFINE_IMPL_FULL (mas2_add_s16_sse, mas2_add_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
+#endif
+
+
+
+


More information about the Liboil-commit mailing list