[Liboil-commit] liboil/i386 liboil/i386_amd64
David Schleef
ds at kemper.freedesktop.org
Thu Feb 21 18:35:46 PST 2008
liboil/i386/Makefile.am | 1
liboil/i386/mas.c | 1114 -----------------------------------
liboil/i386/wavelet.c | 727 ----------------------
liboil/i386_amd64/Makefile.am | 3
liboil/i386_amd64/add2.c | 488 +++++++++++++++
liboil/i386_amd64/mas.c | 1114 +++++++++++++++++++++++++++++++++++
liboil/i386_amd64/multiply_and_acc.c | 250 +++++++
7 files changed, 1855 insertions(+), 1842 deletions(-)
New commits:
commit 205a167d5185b9531ed008c36bc68a9bdc4cac2c
Author: David Schleef <ds at wallace.bigkitten.com>
Date: Thu Feb 21 18:35:42 2008 -0800
Convert some i386 code to dual-arch
diff --git a/liboil/i386/Makefile.am b/liboil/i386/Makefile.am
index dfd07c1..6f4a41e 100644
--- a/liboil/i386/Makefile.am
+++ b/liboil/i386/Makefile.am
@@ -8,7 +8,6 @@ libi386_la_SOURCES = \
copy8x8_i386.c \
diff8x8_i386.c \
error8x8_i386.c \
- mas.c \
md5_i386.c \
mult8x8_i386.c \
recon8x8_i386.c \
diff --git a/liboil/i386/mas.c b/liboil/i386/mas.c
deleted file mode 100644
index b8d4e00..0000000
--- a/liboil/i386/mas.c
+++ /dev/null
@@ -1,1114 +0,0 @@
-
-#include <liboil/liboilfunction.h>
-#include <liboil/liboilclasses.h>
-
-
-void
-mas10_u8_mmx (uint8_t *d, const uint8_t *s1_np9, const int16_t *s2_10,
- const int16_t *s3_2, int n)
-{
- int j;
- int x;
-
- while(n&3) {
- x = 0;
- for(j=0;j<10;j++){
- x += s1_np9[j] * s2_10[j];
- }
- *d = CLAMP((x + s3_2[0])>>s3_2[1],0,255);
- d++;
- s1_np9++;
- n--;
- }
-
- if (n == 0) return;
- n>>=2;
- __asm__ __volatile__("\n"
- " pxor %%mm7, %%mm7\n"
-
- " movd (%[s3_2]), %%mm6\n"
-
- " movzwl 2(%[s3_2]), %%ecx\n"
- " movd %%ecx, %%mm5\n"
-
- "1:\n"
- /* load 128 */
- " pshufw $0x00, %%mm6, %%mm2\n"
-
-#define LOOP(x) \
- " movd " #x "(%[s1_np9]), %%mm0\n" \
- " punpcklbw %%mm7, %%mm0\n" \
- " movq 2*" #x "(%[s2_10]), %%mm1\n" \
- " pshufw $0x00, %%mm1, %%mm1\n" \
- " pmullw %%mm1, %%mm0\n" \
- " paddw %%mm0, %%mm2\n"
-
- LOOP(0)
- LOOP(1)
- LOOP(2)
- LOOP(3)
- LOOP(4)
- LOOP(5)
- LOOP(6)
- LOOP(7)
- LOOP(8)
- LOOP(9)
-#undef LOOP
-
- " psraw %%mm5, %%mm2\n"
- " pmaxsw %%mm7, %%mm2\n"
- " packuswb %%mm2, %%mm2\n"
- " movd %%mm2, 0(%[d])\n"
- " addl $4, %[d]\n"
- " addl $4, %[s1_np9]\n"
- " decl %[n]\n"
- " jnz 1b\n"
- " emms\n"
- : [d] "+r" (d),
- [s1_np9] "+r" (s1_np9),
- [n] "+m" (n)
- : [s2_10] "r" (s2_10),
- [s3_2] "r" (s3_2)
- : "ecx");
-}
-OIL_DEFINE_IMPL_FULL (mas10_u8_mmx, mas10_u8_l15, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
-
-void
-mas10_u8_mmx_2 (uint8_t *d, const uint8_t *s1_np9, const int16_t *s2_10,
- const int16_t *s3_2, int n)
-{
- int j;
- int x;
- int16_t coeff[4*10];
- int16_t *ptr;
-
- ptr = coeff;
-
- while(n&3) {
- x = 0;
- for(j=0;j<10;j++){
- x += s1_np9[j] * s2_10[j];
- }
- *d = CLAMP((x + s3_2[0])>>s3_2[1],0,255);
- d++;
- s1_np9++;
- n--;
- }
-
- for(j=0;j<10;j++){
- ptr[4*j + 0] = s2_10[j];
- ptr[4*j + 1] = s2_10[j];
- ptr[4*j + 2] = s2_10[j];
- ptr[4*j + 3] = s2_10[j];
- }
-
- if (n == 0) return;
- n>>=2;
- __asm__ __volatile__("\n"
- " pxor %%mm7, %%mm7\n"
-
- " movd (%[s3_2]), %%mm6\n"
-
- " movzwl 2(%[s3_2]), %%ecx\n"
- " movd %%ecx, %%mm5\n"
-
- "1:\n"
- /* load 128 */
- " pshufw $0x00, %%mm6, %%mm2\n"
-
-#define LOOP(x) \
- " movd " #x "(%[s1_np9]), %%mm0\n" \
- " punpcklbw %%mm7, %%mm0\n" \
- " pmullw 8*" #x "(%[coeff]), %%mm0\n" \
- " paddw %%mm0, %%mm2\n"
-
- LOOP(0)
- LOOP(1)
- LOOP(2)
- LOOP(3)
- LOOP(4)
- LOOP(5)
- LOOP(6)
- LOOP(7)
- LOOP(8)
- LOOP(9)
-#undef LOOP
-
- " psraw %%mm5, %%mm2\n"
- " pmaxsw %%mm7, %%mm2\n"
- " packuswb %%mm2, %%mm2\n"
- " movd %%mm2, 0(%[d])\n"
- " addl $4, %[d]\n"
- " addl $4, %[s1_np9]\n"
- " decl %[n]\n"
- " jnz 1b\n"
- " emms\n"
- : [d] "+r" (d),
- [s1_np9] "+r" (s1_np9),
- [n] "+m" (n)
- : [coeff] "r" (ptr),
- [s3_2] "r" (s3_2)
- : "ecx");
-}
-OIL_DEFINE_IMPL_FULL (mas10_u8_mmx_2, mas10_u8_l15, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
-
-void
-mas10_u8_mmx_3 (uint8_t *d, const uint8_t *s1_np9, const int16_t *s2_10,
- const int16_t *s3_2, int n)
-{
- int j;
- int x;
-
- while(n&3) {
- x = 0;
- for(j=0;j<10;j++){
- x += s1_np9[j] * s2_10[j];
- }
- *d = CLAMP((x + s3_2[0])>>s3_2[1],0,255);
- d++;
- s1_np9++;
- n--;
- }
-
- if (n == 0) return;
- n>>=2;
- __asm__ __volatile__("\n"
- " pxor %%mm7, %%mm7\n"
-
- " movd (%[s3_2]), %%mm6\n"
-
- " movzwl 2(%[s3_2]), %%ecx\n"
- " movd %%ecx, %%mm5\n"
-
- " movq 0(%[s2_10]), %%mm3\n"
- " movq 8(%[s2_10]), %%mm4\n"
-
- "1:\n"
- /* load 128 */
- " pshufw $0x00, %%mm6, %%mm2\n"
-
-#define LOOP(x) \
- " movd " #x "(%[s1_np9]), %%mm0\n" \
- " punpcklbw %%mm7, %%mm0\n" \
- " movq 2*" #x "(%[s2_10]), %%mm1\n" \
- " pshufw $0x00, %%mm1, %%mm1\n" \
- " pmullw %%mm1, %%mm0\n" \
- " paddw %%mm0, %%mm2\n"
-
- //LOOP(0)
- " movd 0(%[s1_np9]), %%mm0\n"
- " punpcklbw %%mm7, %%mm0\n"
- " pshufw $0x00, %%mm3, %%mm1\n"
- " pmullw %%mm1, %%mm0\n"
- " paddw %%mm0, %%mm2\n"
-
- //LOOP(1)
- " movd 1(%[s1_np9]), %%mm0\n"
- " punpcklbw %%mm7, %%mm0\n"
- " pshufw $0x55*1, %%mm3, %%mm1\n"
- " pmullw %%mm1, %%mm0\n"
- " paddw %%mm0, %%mm2\n"
-
- //LOOP(2)
- " movd 2(%[s1_np9]), %%mm0\n"
- " punpcklbw %%mm7, %%mm0\n"
- " pshufw $0x55*2, %%mm3, %%mm1\n"
- " pmullw %%mm1, %%mm0\n"
- " paddw %%mm0, %%mm2\n"
-
- //LOOP(3)
- " movd 3(%[s1_np9]), %%mm0\n"
- " punpcklbw %%mm7, %%mm0\n"
- " pshufw $0x55*3, %%mm3, %%mm1\n"
- " pmullw %%mm1, %%mm0\n"
- " paddw %%mm0, %%mm2\n"
-
- //LOOP(4)
- " movd 4(%[s1_np9]), %%mm0\n"
- " punpcklbw %%mm7, %%mm0\n"
- " pshufw $0x00, %%mm4, %%mm1\n"
- " pmullw %%mm1, %%mm0\n"
- " paddw %%mm0, %%mm2\n"
-
- //LOOP(5)
- " movd 5(%[s1_np9]), %%mm0\n"
- " punpcklbw %%mm7, %%mm0\n"
- " pshufw $0x55*1, %%mm4, %%mm1\n"
- " pmullw %%mm1, %%mm0\n"
- " paddw %%mm0, %%mm2\n"
-
- //LOOP(6)
- " movd 6(%[s1_np9]), %%mm0\n"
- " punpcklbw %%mm7, %%mm0\n"
- " pshufw $0x55*2, %%mm4, %%mm1\n"
- " pmullw %%mm1, %%mm0\n"
- " paddw %%mm0, %%mm2\n"
-
- //LOOP(7)
- " movd 7(%[s1_np9]), %%mm0\n"
- " punpcklbw %%mm7, %%mm0\n"
- " pshufw $0x55*3, %%mm4, %%mm1\n"
- " pmullw %%mm1, %%mm0\n"
- " paddw %%mm0, %%mm2\n"
-
- LOOP(8)
- LOOP(9)
-#undef LOOP
-
- " psraw %%mm5, %%mm2\n"
- " pmaxsw %%mm7, %%mm2\n"
- " packuswb %%mm2, %%mm2\n"
- " movd %%mm2, 0(%[d])\n"
- " addl $4, %[d]\n"
- " addl $4, %[s1_np9]\n"
- " decl %[n]\n"
- " jnz 1b\n"
- " emms\n"
- : [d] "+r" (d),
- [s1_np9] "+r" (s1_np9),
- [n] "+m" (n)
- : [s2_10] "r" (s2_10),
- [s3_2] "r" (s3_2)
- : "ecx");
-}
-OIL_DEFINE_IMPL_FULL (mas10_u8_mmx_3, mas10_u8_l15, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
-
-void
-mas10_u8_mmx_4 (uint8_t *d, const uint8_t *s1_np9, const int16_t *s2_10,
- const int16_t *s3_2, int n)
-{
- if (n == 0) return;
- __asm__ __volatile__("\n"
- " pxor %%mm7, %%mm7\n"
-
- " movzwl 0(%[s3_2]), %%ecx\n"
- " movd %%ecx, %%mm6\n"
-
- " movzwl 2(%[s3_2]), %%ecx\n"
- " movd %%ecx, %%mm5\n"
-
- "1:\n"
- " movd 0(%[s1_np9]), %%mm0\n"
- " punpcklbw %%mm7, %%mm0\n"
- " pmaddwd 0(%[s2_10]), %%mm0\n"
-
- " movd 4(%[s1_np9]), %%mm1\n"
- " punpcklbw %%mm7, %%mm1\n"
- " pmaddwd 8(%[s2_10]), %%mm1\n"
-
- " movd 8(%[s1_np9]), %%mm2\n"
- " punpcklbw %%mm7, %%mm2\n"
- " pmaddwd 16(%[s2_10]), %%mm2\n"
-
- " paddd %%mm1, %%mm0\n"
- " movq %%mm0, %%mm1\n"
- " psrlq $32, %%mm0\n"
- " paddd %%mm1, %%mm0\n"
- " paddd %%mm2, %%mm0\n"
- " paddd %%mm6, %%mm0\n"
-
- " psrad %%mm5, %%mm0\n"
- " pmaxsw %%mm7, %%mm0\n"
- " packuswb %%mm0, %%mm0\n"
- " movd %%mm0, %%ecx\n"
- " movb %%cl,0(%[d])\n"
-
- " addl $1, %[d]\n"
- " addl $1, %[s1_np9]\n"
- " decl %[n]\n"
- " jnz 1b\n"
- " emms\n"
- : [d] "+r" (d),
- [s1_np9] "+r" (s1_np9),
- [n] "+m" (n)
- : [s2_10] "r" (s2_10),
- [s3_2] "r" (s3_2)
- : "ecx");
-}
-OIL_DEFINE_IMPL_FULL (mas10_u8_mmx_4, mas10_u8, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
-
-
-void
-mas8_u8_mmx_3 (uint8_t *d, const uint8_t *s1_np7, const int16_t *s2_8,
- const int16_t *s3_2, int n)
-{
- int j;
- int x;
-
- while(n&3) {
- x = 0;
- for(j=0;j<8;j++){
- x += s1_np7[j] * s2_8[j];
- }
- *d = CLAMP((x + s3_2[0])>>s3_2[1],0,255);
- d++;
- s1_np7++;
- n--;
- }
-
- if (n == 0) return;
- n>>=2;
- __asm__ __volatile__("\n"
- " pxor %%mm7, %%mm7\n"
-
- " movd (%[s3_2]), %%mm6\n"
-
- " movzwl 2(%[s3_2]), %%ecx\n"
- " movd %%ecx, %%mm5\n"
-
- " movq 0(%[s2_8]), %%mm3\n"
- " movq 8(%[s2_8]), %%mm4\n"
-
- "1:\n"
- /* load 128 */
- " pshufw $0x00, %%mm6, %%mm2\n"
-
- " movd 0(%[s1_np7]), %%mm0\n"
- " punpcklbw %%mm7, %%mm0\n"
- " pshufw $0x00, %%mm3, %%mm1\n"
- " pmullw %%mm1, %%mm0\n"
- " paddw %%mm0, %%mm2\n"
-
- " movd 1(%[s1_np7]), %%mm0\n"
- " punpcklbw %%mm7, %%mm0\n"
- " pshufw $0x55*1, %%mm3, %%mm1\n"
- " pmullw %%mm1, %%mm0\n"
- " paddw %%mm0, %%mm2\n"
-
- " movd 2(%[s1_np7]), %%mm0\n"
- " punpcklbw %%mm7, %%mm0\n"
- " pshufw $0x55*2, %%mm3, %%mm1\n"
- " pmullw %%mm1, %%mm0\n"
- " paddw %%mm0, %%mm2\n"
-
- " movd 3(%[s1_np7]), %%mm0\n"
- " punpcklbw %%mm7, %%mm0\n"
- " pshufw $0x55*3, %%mm3, %%mm1\n"
- " pmullw %%mm1, %%mm0\n"
- " paddw %%mm0, %%mm2\n"
-
- " movd 4(%[s1_np7]), %%mm0\n"
- " punpcklbw %%mm7, %%mm0\n"
- " pshufw $0x00, %%mm4, %%mm1\n"
- " pmullw %%mm1, %%mm0\n"
- " paddw %%mm0, %%mm2\n"
-
- " movd 5(%[s1_np7]), %%mm0\n"
- " punpcklbw %%mm7, %%mm0\n"
- " pshufw $0x55*1, %%mm4, %%mm1\n"
- " pmullw %%mm1, %%mm0\n"
- " paddw %%mm0, %%mm2\n"
-
- " movd 6(%[s1_np7]), %%mm0\n"
- " punpcklbw %%mm7, %%mm0\n"
- " pshufw $0x55*2, %%mm4, %%mm1\n"
- " pmullw %%mm1, %%mm0\n"
- " paddw %%mm0, %%mm2\n"
-
- " movd 7(%[s1_np7]), %%mm0\n"
- " punpcklbw %%mm7, %%mm0\n"
- " pshufw $0x55*3, %%mm4, %%mm1\n"
- " pmullw %%mm1, %%mm0\n"
- " paddw %%mm0, %%mm2\n"
-
- " psraw %%mm5, %%mm2\n"
- " pmaxsw %%mm7, %%mm2\n"
- " packuswb %%mm2, %%mm2\n"
- " movd %%mm2, 0(%[d])\n"
- " addl $4, %[d]\n"
- " addl $4, %[s1_np7]\n"
- " decl %[n]\n"
- " jnz 1b\n"
- " emms\n"
- : [d] "+r" (d),
- [s1_np7] "+r" (s1_np7),
- [n] "+m" (n)
- : [s2_8] "r" (s2_8),
- [s3_2] "r" (s3_2)
- : "ecx");
-}
-OIL_DEFINE_IMPL_FULL (mas8_u8_mmx_3, mas8_u8_l15, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
-
-void
-mas8_u8_mmx_4 (uint8_t *d, const uint8_t *s1_np7, const int16_t *s2_8,
- const int16_t *s3_2, int n)
-{
- if (n == 0) return;
- __asm__ __volatile__("\n"
- " pxor %%mm7, %%mm7\n"
-
- " movzwl 0(%[s3_2]), %%ecx\n"
- " movd %%ecx, %%mm6\n"
-
- " movzwl 2(%[s3_2]), %%ecx\n"
- " movd %%ecx, %%mm5\n"
-
- "1:\n"
- " movd 0(%[s1_np7]), %%mm0\n"
- " punpcklbw %%mm7, %%mm0\n"
- " pmaddwd 0(%[s2_8]), %%mm0\n"
-
- " movd 4(%[s1_np7]), %%mm1\n"
- " punpcklbw %%mm7, %%mm1\n"
- " pmaddwd 8(%[s2_8]), %%mm1\n"
-
- " paddd %%mm1, %%mm0\n"
- " movq %%mm0, %%mm1\n"
- " psrlq $32, %%mm0\n"
- " paddd %%mm1, %%mm0\n"
- " paddd %%mm6, %%mm0\n"
-
- " psrad %%mm5, %%mm0\n"
- " pmaxsw %%mm7, %%mm0\n"
- " packuswb %%mm0, %%mm0\n"
- " movd %%mm0, %%ecx\n"
- " movb %%cl,0(%[d])\n"
-
- " addl $1, %[d]\n"
- " addl $1, %[s1_np7]\n"
- " decl %[n]\n"
- " jnz 1b\n"
- " emms\n"
- : [d] "+r" (d),
- [s1_np7] "+r" (s1_np7),
- [n] "+m" (n)
- : [s2_8] "r" (s2_8),
- [s3_2] "r" (s3_2)
- : "ecx");
-}
-OIL_DEFINE_IMPL_FULL (mas8_u8_mmx_4, mas8_u8, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
-
-void
-mas8_u8_sym_mmx_3 (uint8_t *d, const uint8_t *s1_np7, const int16_t *s2_8,
- const int16_t *s3_2, int n)
-{
- int j;
- int x;
-
- while(n&3) {
- x = 0;
- for(j=0;j<8;j++){
- x += s1_np7[j] * s2_8[j];
- }
- *d = CLAMP((x + s3_2[0])>>s3_2[1],0,255);
- d++;
- s1_np7++;
- n--;
- }
-
- if (n == 0) return;
- n>>=2;
- __asm__ __volatile__("\n"
- " pxor %%mm7, %%mm7\n"
-
- " movd (%[s3_2]), %%mm6\n"
-
- " movzwl 2(%[s3_2]), %%ecx\n"
- " movd %%ecx, %%mm5\n"
-
- " movq 0(%[s2_8]), %%mm3\n"
- " movq 8(%[s2_8]), %%mm4\n"
-
- " .p2align 4,,15 \n"
- "1:\n"
- /* load 128 */
- " pshufw $0x00, %%mm6, %%mm2\n"
-
- " movd 0(%[s1_np7]), %%mm0\n"
- " punpcklbw %%mm7, %%mm0\n"
- " movd 7(%[s1_np7]), %%mm1\n"
- " punpcklbw %%mm7, %%mm1\n"
- " paddw %%mm1, %%mm0\n"
- //" pshufw $0x00, %%mm3, %%mm1\n"
- //" pmullw %%mm1, %%mm0\n"
- //" paddw %%mm0, %%mm2\n"
- " psubw %%mm0, %%mm2\n"
-
- " movd 1(%[s1_np7]), %%mm0\n"
- " punpcklbw %%mm7, %%mm0\n"
- " movd 6(%[s1_np7]), %%mm1\n"
- " punpcklbw %%mm7, %%mm1\n"
- " paddw %%mm1, %%mm0\n"
- " pshufw $0x55*1, %%mm3, %%mm1\n"
- " pmullw %%mm1, %%mm0\n"
- " paddw %%mm0, %%mm2\n"
-
- " movd 2(%[s1_np7]), %%mm0\n"
- " punpcklbw %%mm7, %%mm0\n"
- " movd 5(%[s1_np7]), %%mm1\n"
- " punpcklbw %%mm7, %%mm1\n"
- " paddw %%mm1, %%mm0\n"
- " pshufw $0x55*2, %%mm3, %%mm1\n"
- " pmullw %%mm1, %%mm0\n"
- " paddw %%mm0, %%mm2\n"
-
- " movd 3(%[s1_np7]), %%mm0\n"
- " punpcklbw %%mm7, %%mm0\n"
- " movd 4(%[s1_np7]), %%mm1\n"
- " punpcklbw %%mm7, %%mm1\n"
- " paddw %%mm1, %%mm0\n"
- " pshufw $0x55*3, %%mm3, %%mm1\n"
- " pmullw %%mm1, %%mm0\n"
- " paddw %%mm0, %%mm2\n"
-
- " psraw %%mm5, %%mm2\n"
- " pmaxsw %%mm7, %%mm2\n"
- " packuswb %%mm2, %%mm2\n"
- " movd %%mm2, 0(%[d])\n"
- " addl $4, %[d]\n"
- " addl $4, %[s1_np7]\n"
- " decl %[n]\n"
- " jnz 1b\n"
- " emms\n"
- : [d] "+r" (d),
- [s1_np7] "+r" (s1_np7),
- [n] "+m" (n)
- : [s2_8] "r" (s2_8),
- [s3_2] "r" (s3_2)
- : "ecx");
-}
-OIL_DEFINE_IMPL_FULL (mas8_u8_sym_mmx_3, mas8_u8_sym_l15, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
-
-void
-mas8_u8_sym_mmx_41 (uint8_t *d, const uint8_t *s1_np7, const int16_t *s2_8,
- const int16_t *s3_2, int n)
-{
- int j;
- int x;
- int16_t tmp[16];
-
- while(n&3) {
- x = 0;
- for(j=0;j<8;j++){
- x += s1_np7[j] * s2_8[j];
- }
- *d = CLAMP((x + s3_2[0])>>s3_2[1],0,255);
- d++;
- s1_np7++;
- n--;
- }
-
- if (n == 0) return;
- n>>=2;
- __asm__ __volatile__("\n"
- " pxor %%mm7, %%mm7\n"
-
- " movd (%[s3_2]), %%mm6\n"
-
- " movzwl 2(%[s3_2]), %%ecx\n"
- " movd %%ecx, %%mm5\n"
-
- " movq 0(%[s2_8]), %%mm3\n"
- " pshufw $0x55*0, %%mm3, %%mm1\n"
- " movq %%mm1, 0(%[coeff])\n"
- " pshufw $0x55*1, %%mm3, %%mm1\n"
- " movq %%mm1, 8(%[coeff])\n"
- " pshufw $0x55*2, %%mm3, %%mm1\n"
- " movq %%mm1, 16(%[coeff])\n"
- " pshufw $0x55*3, %%mm3, %%mm1\n"
- " movq %%mm1, 24(%[coeff])\n"
- :
- : [s2_8] "r" (s2_8),
- [s3_2] "r" (s3_2),
- [coeff] "r" (tmp)
- : "ecx");
-
- __asm__ __volatile__("\n"
- " .p2align 4,,15 \n"
- "1:\n"
- /* load 128 */
- " pshufw $0x00, %%mm6, %%mm2\n"
-
- " movd 0(%[s1_np7]), %%mm0\n"
- " punpcklbw %%mm7, %%mm0\n"
- " movd 7(%[s1_np7]), %%mm1\n"
- " punpcklbw %%mm7, %%mm1\n"
- " paddw %%mm1, %%mm0\n"
- " pmullw 0(%[coeff]), %%mm0\n"
- " paddw %%mm0, %%mm2\n"
-
- " movd 1(%[s1_np7]), %%mm0\n"
- " punpcklbw %%mm7, %%mm0\n"
- " movd 6(%[s1_np7]), %%mm1\n"
- " punpcklbw %%mm7, %%mm1\n"
- " paddw %%mm1, %%mm0\n"
- " pmullw 8(%[coeff]), %%mm0\n"
- " paddw %%mm0, %%mm2\n"
-
- " movd 2(%[s1_np7]), %%mm0\n"
- " punpcklbw %%mm7, %%mm0\n"
- " movd 5(%[s1_np7]), %%mm1\n"
- " punpcklbw %%mm7, %%mm1\n"
- " paddw %%mm1, %%mm0\n"
- " pmullw 16(%[coeff]), %%mm0\n"
- " paddw %%mm0, %%mm2\n"
-
- " movd 3(%[s1_np7]), %%mm0\n"
- " punpcklbw %%mm7, %%mm0\n"
- " movd 4(%[s1_np7]), %%mm1\n"
- " punpcklbw %%mm7, %%mm1\n"
- " paddw %%mm1, %%mm0\n"
- " pmullw 24(%[coeff]), %%mm0\n"
- " paddw %%mm0, %%mm2\n"
-
- " psraw %%mm5, %%mm2\n"
- " pmaxsw %%mm7, %%mm2\n"
- " packuswb %%mm2, %%mm2\n"
- " movd %%mm2, 0(%[d])\n"
- " addl $4, %[d]\n"
- " addl $4, %[s1_np7]\n"
- " decl %[n]\n"
- " jnz 1b\n"
- " emms\n"
- : [d] "+r" (d),
- [s1_np7] "+r" (s1_np7),
- [n] "+m" (n)
- : [s2_8] "r" (s2_8),
- [coeff] "r" (tmp)
- : "ecx");
-}
-OIL_DEFINE_IMPL_FULL (mas8_u8_sym_mmx_41, mas8_u8_sym_l15, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
-
-
-#define PSHUFW_3210 "0xe4"
-#define PSHUFW_0123 "0x1b"
-
-void
-mas8_u8_sym_mmx_5 (uint8_t *d, const uint8_t *s1_np7, const int16_t *s2_8,
- const int16_t *s3_2, int n)
-{
- if (n==0) return;
- __asm__ __volatile__("\n"
- " pxor %%mm7, %%mm7\n"
-
- " movzwl 0(%[s3_2]), %%ecx\n"
- " movd %%ecx, %%mm6\n"
- " pshufw $0x44, %%mm6, %%mm6\n" // 01 00 01 00
-
- " movzwl 2(%[s3_2]), %%ecx\n"
- " movd %%ecx, %%mm5\n"
-
- " cmpl $0, %[n]\n"
- " jz 2f\n"
-
- "1:\n"
- " movd 0(%[s1_np7]), %%mm0\n"
- " punpcklbw %%mm7, %%mm0\n"
-#if 1
- " movd 4(%[s1_np7]), %%mm1\n"
- " punpcklbw %%mm7, %%mm1\n"
- " pshufw $0x1b, %%mm1, %%mm1\n" // 00 01 10 11
- " paddw %%mm1, %%mm0\n"
- " pmaddwd 0(%[s2_8]), %%mm0\n"
-#else
- " pmaddwd 0(%[s2_8]), %%mm0\n"
-
- " movd 4(%[s1_np7]), %%mm1\n"
- " punpcklbw %%mm7, %%mm1\n"
- " pmaddwd 8(%[s2_8]), %%mm1\n"
- " paddd %%mm1, %%mm0\n"
-#endif
-
- " pshufw $0xee, %%mm0, %%mm1\n" // 11 10 11 10
- " paddd %%mm1, %%mm0\n"
- " paddd %%mm6, %%mm0\n"
-
- " psrad %%mm5, %%mm0\n"
- " pmaxsw %%mm7, %%mm0\n"
- " packuswb %%mm0, %%mm0\n"
- " movd %%mm0, %%ecx\n"
- " movb %%cl,0(%[d])\n"
-
- " addl $1, %[d]\n"
- " addl $1, %[s1_np7]\n"
- " decl %[n]\n"
- " jnz 1b\n"
-
- "2:\n"
- " emms\n"
- : [d] "+r" (d),
- [s1_np7] "+r" (s1_np7),
- [n] "+m" (n)
- : [s2_8] "r" (s2_8),
- [s3_2] "r" (s3_2)
- : "ecx");
-}
-OIL_DEFINE_IMPL_FULL (mas8_u8_sym_mmx_5, mas8_u8_sym_l15, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
-
-void
-mas8_u8_sym_mmx_6 (uint8_t *d, const uint8_t *s1_np7, const int16_t *s2_8,
- const int16_t *s3_2, int n)
-{
- int8_t coeff[8];
- int8_t *ack;
- int i;
-
- for(i=0;i<8;i++){
- //coeff[i] = s2_8[i];
- coeff[i] = i;
- }
- ack = coeff;
-
- if (n==0) return;
- __asm__ __volatile__("\n"
- " pxor %%mm7, %%mm7\n"
-
- " movzwl 0(%[s3_2]), %%ecx\n"
- " movd %%ecx, %%mm6\n"
- " pshufw $0x44, %%mm6, %%mm6\n" // 01 00 01 00
-
- " movzwl 2(%[s3_2]), %%ecx\n"
- " movd %%ecx, %%mm5\n"
-
- " movq 0(%[s2_8]), %%mm4\n"
- " packsswb 8(%[s2_8]), %%mm4\n"
-
- "1:\n"
- " movq 0(%[s1_np7]), %%mm0\n"
- " pmaddubsw %%mm4, %%mm0\n"
-
-#if 1
- " pshufw $0xee, %%mm0, %%mm1\n" // 11 10 11 10
- " paddw %%mm1, %%mm0\n"
- " pshufw $0x55, %%mm0, %%mm1\n" // 01 01 01 01
- " paddw %%mm1, %%mm0\n"
-#else
- " phaddw %%mm0, %%mm0\n"
- " phaddw %%mm0, %%mm0\n"
-#endif
-
- " paddw %%mm6, %%mm0\n"
- " psraw %%mm5, %%mm0\n"
- " pmaxsw %%mm7, %%mm0\n"
- " packuswb %%mm0, %%mm0\n"
- " movd %%mm0, %%ecx\n"
- " movb %%cl,0(%[d])\n"
-
- " addl $1, %[d]\n"
- " addl $1, %[s1_np7]\n"
- " decl %[n]\n"
- " jnz 1b\n"
-
- " emms\n"
- : [d] "+r" (d),
- [s1_np7] "+r" (s1_np7),
- [n] "+m" (n)
- : [s2_8] "r" (s2_8),
- [s3_2] "r" (s3_2)
- : "ecx");
-}
-OIL_DEFINE_IMPL_FULL (mas8_u8_sym_mmx_6, mas8_u8_sym_l15, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT|OIL_IMPL_FLAG_SSSE3);
-
-#ifdef ENABLE_BROKEN_IMPLS
-/* This only works for the taps array: -1, 3, -7, 21, 21, -7, 3, -1 */
-void
-mas8_u8_supersym_mmx (uint8_t *d, const uint8_t *s1_np7, const int16_t *s2_8,
- const int16_t *s3_2, int n)
-{
- int j;
- int x;
-
- while(n&3) {
- x = 0;
- for(j=0;j<8;j++){
- x += s1_np7[j] * s2_8[j];
- }
- *d = CLAMP((x + s3_2[0])>>s3_2[1],0,255);
- d++;
- s1_np7++;
- n--;
- }
-
- n>>=2;
- __asm__ __volatile__("\n"
- " pxor %%mm7, %%mm7\n"
-
- " movd (%[s3_2]), %%mm6\n"
- " pshufw $0x00, %%mm6, %%mm6\n"
-
- " movzwl 2(%[s3_2]), %%ecx\n"
- " movd %%ecx, %%mm5\n"
-
- " movq 0(%[s2_8]), %%mm3\n"
- " movq 8(%[s2_8]), %%mm4\n"
-
- "1:\n"
- " movd 0(%[s1_np7]), %%mm0\n"
- " punpcklbw %%mm7, %%mm0\n"
- " movd 7(%[s1_np7]), %%mm1\n"
- " punpcklbw %%mm7, %%mm1\n"
- " paddw %%mm1, %%mm0\n"
-
- " movd 1(%[s1_np7]), %%mm2\n"
- " punpcklbw %%mm7, %%mm2\n"
- " movd 6(%[s1_np7]), %%mm3\n"
- " punpcklbw %%mm7, %%mm3\n"
- " paddw %%mm3, %%mm2\n"
-
- " paddw %%mm2, %%mm0\n"
- " psllw $2, %%mm2\n"
- " psubw %%mm0, %%mm2\n"
- " movq %%mm2, %%mm4\n"
-
- " movd 2(%[s1_np7]), %%mm0\n"
- " punpcklbw %%mm7, %%mm0\n"
- " movd 5(%[s1_np7]), %%mm1\n"
- " punpcklbw %%mm7, %%mm1\n"
- " paddw %%mm1, %%mm0\n"
-
- " movd 3(%[s1_np7]), %%mm2\n"
- " punpcklbw %%mm7, %%mm2\n"
- " movd 4(%[s1_np7]), %%mm3\n"
- " punpcklbw %%mm7, %%mm3\n"
- " paddw %%mm3, %%mm2\n"
-
- " paddw %%mm2, %%mm0\n"
- " psllw $2, %%mm2\n"
- " psubw %%mm0, %%mm2\n"
-
- " psubw %%mm2, %%mm4\n"
- " psllw $3, %%mm2\n"
- " paddw %%mm4, %%mm2\n"
-
- " paddw %%mm6, %%mm2\n"
-
- " psraw %%mm5, %%mm2\n"
- " pmaxsw %%mm7, %%mm2\n"
- " packuswb %%mm2, %%mm2\n"
- " movd %%mm2, 0(%[d])\n"
- " addl $4, %[d]\n"
- " addl $4, %[s1_np7]\n"
- " decl %[n]\n"
- " jnz 1b\n"
- " emms\n"
- : [d] "+r" (d),
- [s1_np7] "+r" (s1_np7),
- [n] "+m" (n)
- : [s2_8] "r" (s2_8),
- [s3_2] "r" (s3_2)
- : "ecx");
-}
-OIL_DEFINE_IMPL_FULL (mas8_u8_supersym_mmx, mas8_u8_sym_l15, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
-#endif
-
-void
-mas12_addc_rshift_decim2_u8_mmx_4 (uint8_t *d, const uint8_t *s1_2xnp11,
- const int16_t *s2_12, const int16_t *s3_2, int n)
-{
- if (n == 0) return;
- __asm__ __volatile__("\n"
- " pxor %%mm7, %%mm7\n"
-
- " movzwl 0(%[s3_2]), %%ecx\n"
- " movd %%ecx, %%mm6\n"
-
- " movzwl 2(%[s3_2]), %%ecx\n"
- " movd %%ecx, %%mm5\n"
-
- "1:\n"
- " movd 0(%[s1_2xnp11]), %%mm0\n"
- " punpcklbw %%mm7, %%mm0\n"
- " pmaddwd 0(%[s2_12]), %%mm0\n"
-
- " movd 4(%[s1_2xnp11]), %%mm1\n"
- " punpcklbw %%mm7, %%mm1\n"
- " pmaddwd 8(%[s2_12]), %%mm1\n"
- " paddd %%mm1, %%mm0\n"
-
- " movd 8(%[s1_2xnp11]), %%mm1\n"
- " punpcklbw %%mm7, %%mm1\n"
- " pmaddwd 16(%[s2_12]), %%mm1\n"
- " paddd %%mm1, %%mm0\n"
-
- " movq %%mm0, %%mm1\n"
- " psrlq $32, %%mm0\n"
- " paddd %%mm1, %%mm0\n"
- " paddd %%mm6, %%mm0\n"
-
- " psrad %%mm5, %%mm0\n"
- " pmaxsw %%mm7, %%mm0\n"
- " packuswb %%mm0, %%mm0\n"
- " movd %%mm0, %%ecx\n"
- " movb %%cl,0(%[d])\n"
-
- " addl $1, %[d]\n"
- " addl $2, %[s1_2xnp11]\n"
- " decl %[n]\n"
- " jnz 1b\n"
- " emms\n"
- : [d] "+r" (d),
- [s1_2xnp11] "+r" (s1_2xnp11),
- [n] "+m" (n)
- : [s2_12] "r" (s2_12),
- [s3_2] "r" (s3_2)
- : "ecx");
-}
-OIL_DEFINE_IMPL_FULL (mas12_addc_rshift_decim2_u8_mmx_4,
- mas12_addc_rshift_decim2_u8, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
-
-#if 0
-void
-mas8_addc_rshift_decim2_u8_mmx_4 (uint8_t *d, const uint8_t *s1_2xnp9,
- const int16_t *s2_8, const int16_t *s3_2, int n)
-{
- if (n == 0) return;
- __asm__ __volatile__("\n"
- " pxor %%mm7, %%mm7\n"
-
- " movzwl 0(%[s3_2]), %%ecx\n"
- " movd %%ecx, %%mm6\n"
-
- " movzwl 2(%[s3_2]), %%ecx\n"
- " movd %%ecx, %%mm5\n"
-
- "1:\n"
- " movd 0(%[s1_2xnp9]), %%mm0\n"
- " punpcklbw %%mm7, %%mm0\n"
- " pmaddwd 0(%[s2_8]), %%mm0\n"
-
- " movd 4(%[s1_2xnp9]), %%mm1\n"
- " punpcklbw %%mm7, %%mm1\n"
- " pmaddwd 8(%[s2_8]), %%mm1\n"
- " paddd %%mm1, %%mm0\n"
-
- " movq %%mm0, %%mm1\n"
- " psrlq $32, %%mm0\n"
- " paddd %%mm1, %%mm0\n"
- " paddd %%mm6, %%mm0\n"
-
- " psrad %%mm5, %%mm0\n"
- " pmaxsw %%mm7, %%mm0\n"
- " packuswb %%mm0, %%mm0\n"
- " movd %%mm0, %%ecx\n"
- " movb %%cl,0(%[d])\n"
-
- " addl $1, %[d]\n"
- " addl $2, %[s1_2xnp9]\n"
- " decl %[n]\n"
- " jnz 1b\n"
- " emms\n"
- : [d] "+r" (d),
- [s1_2xnp9] "+r" (s1_2xnp9),
- [n] "+m" (n)
- : [s2_8] "r" (s2_8),
- [s3_2] "r" (s3_2)
- : "ecx");
-}
-OIL_DEFINE_IMPL_FULL (mas8_addc_rshift_decim2_u8_mmx_4,
- mas8_addc_rshift_decim2_u8, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
-
-#endif
-
-void
-mas8_across_u8_mmx_3 (uint8_t *d, const uint8_t *s1_nx8, int ss1,
- const int16_t *s2_8, const int16_t *s3_2, int n)
-{
- int i;
- int x;
-
- while(n&3) {
- x = 0;
- for(i=0;i<8;i++){
- x += OIL_GET(s1_nx8, i*ss1, uint8_t)*s2_8[i];
- }
- *d = CLAMP((x + s3_2[0])>>s3_2[1],0,255);
- d++;
- s1_nx8++;
- n--;
- }
-
- if (n == 0) return;
- n>>=2;
- __asm__ __volatile__("\n"
- " pxor %%mm7, %%mm7\n"
-
- " movd (%[s3_2]), %%mm6\n"
-
- " movzwl 2(%[s3_2]), %%ecx\n"
- " movd %%ecx, %%mm5\n"
-
- " movq 0(%[s2_8]), %%mm3\n"
- " movq 8(%[s2_8]), %%mm4\n"
- :
- : [s2_8] "r" (s2_8),
- [s3_2] "r" (s3_2)
- : "ecx");
-
- while (n > 0) {
- const uint8_t *p = s1_nx8;
- __asm__ __volatile__("\n"
- "1:\n"
- /* load 128 */
- " pshufw $0x00, %%mm6, %%mm2\n"
-
- " movd 0(%[p]), %%mm0\n"
- " addl %[ss1], %[p]\n"
- " punpcklbw %%mm7, %%mm0\n"
- " pshufw $0x00, %%mm3, %%mm1\n"
- " pmullw %%mm1, %%mm0\n"
- " paddw %%mm0, %%mm2\n"
-
- " movd 0(%[p]), %%mm0\n"
- " addl %[ss1], %[p]\n"
- " punpcklbw %%mm7, %%mm0\n"
- " pshufw $0x55*1, %%mm3, %%mm1\n"
- " pmullw %%mm1, %%mm0\n"
- " paddw %%mm0, %%mm2\n"
-
- " movd 0(%[p]), %%mm0\n"
- " addl %[ss1], %[p]\n"
- " punpcklbw %%mm7, %%mm0\n"
- " pshufw $0x55*2, %%mm3, %%mm1\n"
- " pmullw %%mm1, %%mm0\n"
- " paddw %%mm0, %%mm2\n"
-
- " movd 0(%[p]), %%mm0\n"
- " addl %[ss1], %[p]\n"
- " punpcklbw %%mm7, %%mm0\n"
- " pshufw $0x55*3, %%mm3, %%mm1\n"
- " pmullw %%mm1, %%mm0\n"
- " paddw %%mm0, %%mm2\n"
-
- " movd 0(%[p]), %%mm0\n"
- " addl %[ss1], %[p]\n"
- " punpcklbw %%mm7, %%mm0\n"
- " pshufw $0x00, %%mm4, %%mm1\n"
- " pmullw %%mm1, %%mm0\n"
- " paddw %%mm0, %%mm2\n"
-
- " movd 0(%[p]), %%mm0\n"
- " addl %[ss1], %[p]\n"
- " punpcklbw %%mm7, %%mm0\n"
- " pshufw $0x55*1, %%mm4, %%mm1\n"
- " pmullw %%mm1, %%mm0\n"
- " paddw %%mm0, %%mm2\n"
-
- " movd 0(%[p]), %%mm0\n"
- " addl %[ss1], %[p]\n"
- " punpcklbw %%mm7, %%mm0\n"
- " pshufw $0x55*2, %%mm4, %%mm1\n"
- " pmullw %%mm1, %%mm0\n"
- " paddw %%mm0, %%mm2\n"
-
- " movd 0(%[p]), %%mm0\n"
- " addl %[ss1], %[p]\n"
- " punpcklbw %%mm7, %%mm0\n"
- " pshufw $0x55*3, %%mm4, %%mm1\n"
- " pmullw %%mm1, %%mm0\n"
- " paddw %%mm0, %%mm2\n"
-
- " psraw %%mm5, %%mm2\n"
- " pmaxsw %%mm7, %%mm2\n"
- " packuswb %%mm2, %%mm2\n"
- " movd %%mm2, 0(%[d])\n"
- : [p] "+r" (p)
- : [d] "r" (d), [ss1] "r" (ss1));
- d+=4;
- s1_nx8+=4;
- n--;
- }
-
- asm volatile ("emms");
-}
-OIL_DEFINE_IMPL_FULL (mas8_across_u8_mmx_3, mas8_across_u8, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
-
diff --git a/liboil/i386/wavelet.c b/liboil/i386/wavelet.c
index ec6fa3c..16b779a 100644
--- a/liboil/i386/wavelet.c
+++ b/liboil/i386/wavelet.c
@@ -2049,730 +2049,3 @@ lshift_s16_mmx_2(int16_t *d1, int16_t *s1, int16_t *s3_1, int n)
OIL_DEFINE_IMPL_FULL (lshift_s16_mmx_2, lshift_s16, OIL_IMPL_FLAG_MMX);
-void
-multiply_and_acc_6xn_s16_u8_mmx (int16_t *i1, int is1, int16_t *s1,
- int ss1, uint8_t *s2, int ss2, int n)
-{
- /* FIXME this reads outside the arrays. Bad. */
- if (n==0) return;
- __asm__ __volatile__ ("\n"
- " pxor %%mm7, %%mm7\n"
- "1:\n"
- " movd 0(%2), %%mm0\n"
- " punpcklbw %%mm7, %%mm0\n"
- " pmullw 0(%1), %%mm0\n"
- " paddw 0(%0), %%mm0\n"
- " movq %%mm0, 0(%0)\n"
- " movd 4(%2), %%mm1\n"
- " punpcklbw %%mm7, %%mm1\n"
- " pmullw 8(%1), %%mm1\n"
- " paddw 8(%0), %%mm1\n"
- " movd %%mm1, 8(%0)\n"
-
- " addl %4, %0\n"
- " addl %5, %1\n"
- " addl %6, %2\n"
- " decl %3\n"
- " jnz 1b\n"
- " emms\n"
- : "+r" (i1), "+r" (s1), "+r" (s2), "+r" (n)
- : "m" (is1), "m" (ss1), "m" (ss2)
- );
-}
-OIL_DEFINE_IMPL_FULL (multiply_and_acc_6xn_s16_u8_mmx,
- multiply_and_acc_6xn_s16_u8, OIL_IMPL_FLAG_MMX);
-
-void
-multiply_and_acc_8xn_s16_u8_mmx (int16_t *i1, int is1, int16_t *s1,
- int ss1, uint8_t *s2, int ss2, int n)
-{
- if (n==0) return;
- __asm__ __volatile__ ("\n"
- " pxor %%mm7, %%mm7\n"
- "1:\n"
- " movd 0(%2), %%mm0\n"
- " punpcklbw %%mm7, %%mm0\n"
- " pmullw 0(%1), %%mm0\n"
- " paddw 0(%0), %%mm0\n"
- " movq %%mm0, 0(%0)\n"
- " movd 4(%2), %%mm1\n"
- " punpcklbw %%mm7, %%mm1\n"
- " pmullw 8(%1), %%mm1\n"
- " paddw 8(%0), %%mm1\n"
- " movq %%mm1, 8(%0)\n"
-
- " addl %4, %0\n"
- " addl %5, %1\n"
- " addl %6, %2\n"
- " decl %3\n"
- " jnz 1b\n"
- " emms\n"
- : "+r" (i1), "+r" (s1), "+r" (s2), "+r" (n)
- : "m" (is1), "m" (ss1), "m" (ss2)
- );
-}
-OIL_DEFINE_IMPL_FULL (multiply_and_acc_8xn_s16_u8_mmx,
- multiply_and_acc_8xn_s16_u8, OIL_IMPL_FLAG_MMX);
-
-void
-multiply_and_acc_16xn_s16_u8_mmx (int16_t *i1, int is1, int16_t *s1,
- int ss1, uint8_t *s2, int ss2, int n)
-{
- if (n==0) return;
- __asm__ __volatile__ ("\n"
- " pxor %%mm7, %%mm7\n"
- "1:\n"
- " movd 0(%2), %%mm0\n"
- " punpcklbw %%mm7, %%mm0\n"
- " pmullw 0(%1), %%mm0\n"
- " paddw 0(%0), %%mm0\n"
- " movq %%mm0, 0(%0)\n"
- " movd 4(%2), %%mm1\n"
- " punpcklbw %%mm7, %%mm1\n"
- " pmullw 8(%1), %%mm1\n"
- " paddw 8(%0), %%mm1\n"
- " movq %%mm1, 8(%0)\n"
- " movd 8(%2), %%mm2\n"
- " punpcklbw %%mm7, %%mm2\n"
- " pmullw 16(%1), %%mm2\n"
- " paddw 16(%0), %%mm2\n"
- " movq %%mm2, 16(%0)\n"
- " movd 12(%2), %%mm2\n"
- " punpcklbw %%mm7, %%mm2\n"
- " pmullw 24(%1), %%mm2\n"
- " paddw 24(%0), %%mm2\n"
- " movq %%mm2, 24(%0)\n"
-
- " addl %4, %0\n"
- " addl %5, %1\n"
- " addl %6, %2\n"
- " decl %3\n"
- " jnz 1b\n"
- " emms\n"
- : "+r" (i1), "+r" (s1), "+r" (s2), "+r" (n)
- : "m" (is1), "m" (ss1), "m" (ss2)
- );
-}
-OIL_DEFINE_IMPL_FULL (multiply_and_acc_16xn_s16_u8_mmx,
- multiply_and_acc_16xn_s16_u8, OIL_IMPL_FLAG_MMX);
-
-void
-multiply_and_acc_24xn_s16_u8_mmx (int16_t *i1, int is1, int16_t *s1,
- int ss1, uint8_t *s2, int ss2, int n)
-{
- if (n==0) return;
- __asm__ __volatile__ ("\n"
- " pxor %%mm7, %%mm7\n"
- "1:\n"
- " movd 0(%2), %%mm0\n"
- " punpcklbw %%mm7, %%mm0\n"
- " pmullw 0(%1), %%mm0\n"
- " paddw 0(%0), %%mm0\n"
- " movq %%mm0, 0(%0)\n"
- " movd 4(%2), %%mm1\n"
- " punpcklbw %%mm7, %%mm1\n"
- " pmullw 8(%1), %%mm1\n"
- " paddw 8(%0), %%mm1\n"
- " movq %%mm1, 8(%0)\n"
- " movd 8(%2), %%mm2\n"
- " punpcklbw %%mm7, %%mm2\n"
- " pmullw 16(%1), %%mm2\n"
- " paddw 16(%0), %%mm2\n"
- " movq %%mm2, 16(%0)\n"
- " movd 12(%2), %%mm2\n"
- " punpcklbw %%mm7, %%mm2\n"
- " pmullw 24(%1), %%mm2\n"
- " paddw 24(%0), %%mm2\n"
- " movq %%mm2, 24(%0)\n"
- " movd 16(%2), %%mm2\n"
- " punpcklbw %%mm7, %%mm2\n"
- " pmullw 32(%1), %%mm2\n"
- " paddw 32(%0), %%mm2\n"
- " movq %%mm2, 32(%0)\n"
- " movd 20(%2), %%mm2\n"
- " punpcklbw %%mm7, %%mm2\n"
- " pmullw 40(%1), %%mm2\n"
- " paddw 40(%0), %%mm2\n"
- " movq %%mm2, 40(%0)\n"
-
- " addl %4, %0\n"
- " addl %5, %1\n"
- " addl %6, %2\n"
- " decl %3\n"
- " jnz 1b\n"
- " emms\n"
- : "+r" (i1), "+r" (s1), "+r" (s2), "+r" (n)
- : "m" (is1), "m" (ss1), "m" (ss2)
- );
-}
-OIL_DEFINE_IMPL_FULL (multiply_and_acc_24xn_s16_u8_mmx,
- multiply_and_acc_24xn_s16_u8, OIL_IMPL_FLAG_MMX);
-
-
-#if 0
-void
-mas2_add_s16_sse (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3_2,
- int16_t *s4_2, int n)
-{
- int shift = s4_2[1];
-
- while (n&7) {
- int x;
-
- x = s4_2[0] + s2[0]*s3_2[0] + s2[1]*s3_2[1];
- x >>= s4_2[1];
- d1[0] = s1[0] + x;
-
- d1++;
- s1++;
- s2++;
- n--;
- }
- if (n==0) return;
-
- n>>=3;
- asm volatile ("\n"
- " movzwl 0(%0), %%ecx\n"
- " movd %%ecx, %%xmm7\n"
- " pshuflw $0x00, %%xmm7, %%xmm7\n"
- " pshufd $0x00, %%xmm7, %%xmm7\n"
- " movzwl 2(%0), %%ecx\n"
- " movd %%ecx, %%xmm6\n"
- " pshuflw $0x00, %%xmm6, %%xmm6\n"
- " pshufd $0x00, %%xmm6, %%xmm6\n"
- " movzwl 0(%1), %%ecx\n"
- " movd %%ecx, %%xmm5\n"
- " pshuflw $0x44, %%xmm5, %%xmm5\n"
- " pshufd $0x00, %%xmm5, %%xmm5\n"
- :: "r" (s3_2), "r" (s4_2)
- : "ecx"
- );
- asm volatile ("\n"
- "1:\n"
- " movdqu 0(%2), %%xmm0\n" // mm0 = s0, s1, s2, s3
- " movdqu 0(%2), %%xmm1\n" // mm1 = s0, s1, s2, s3
- " pmullw %%xmm7, %%xmm0\n" // mm0 = lo(s0*a0), lo(s1*a0), ...
- " pmulhw %%xmm7, %%xmm1\n" // mm1 = hi(s0*a0), hi(s1*a0), ...
- " movdqu %%xmm0, %%xmm2\n" // mm2 = lo(s0*a0), lo(s1*a0), ...
- " punpcklwd %%xmm1, %%xmm0\n" // mm0 = s0*a0, s1*a0
- " punpckhwd %%xmm1, %%xmm2\n" // mm2 = s2*a0, s3*a0
- " movdqu %%xmm2, %%xmm1\n" // mm1 = s2*a0, s3*a0
-
- " movdqu 2(%2), %%xmm2\n"
- " movdqu 2(%2), %%xmm3\n"
- " pmullw %%xmm6, %%xmm2\n"
- " pmulhw %%xmm6, %%xmm3\n"
- " movdqu %%xmm2, %%xmm4\n"
- " punpcklwd %%xmm3, %%xmm2\n" // mm2 = s1*a1, s2*a1
- " punpckhwd %%xmm3, %%xmm4\n" // mm4 = s3*a1, s4*a1
- " movdqu %%xmm4, %%xmm3\n" // mm3 = s3*a1, s4*a1
-
- " paddd %%xmm3, %%xmm1\n" // mm1 = s2*a0 + s3*a1, ...
- " paddd %%xmm2, %%xmm0\n" // mm0 = s0*a0 + s1*a1, ...
-
- " paddd %%xmm5, %%xmm1\n" // mm1 = s2*a0 + s3*a1 + offset, ...
- " paddd %%xmm5, %%xmm0\n" // mm0 = s0*a0 + s1*a1 + offset, ...
-
- " movd %4, %%xmm4\n"
- " psrad %%xmm4, %%xmm1\n" // mm1 = (s2*a0 + s3*a1 + offset)>>shift, ...
- " psrad %%xmm4, %%xmm0\n" // mm0 = (s0*a0 + s1*a1 + offset)>>shift, ...
-
- " packssdw %%xmm1, %%xmm0\n"
- " paddw 0(%1), %%xmm0\n"
- " movdqu %%xmm0, 0(%0)\n"
- " add $16, %0\n"
- " add $16, %1\n"
- " add $16, %2\n"
- " decl %3\n"
- " jnz 1b\n"
- : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n)
- : "r" (shift)
- );
-}
-OIL_DEFINE_IMPL_FULL (mas2_add_s16_sse, mas2_add_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
-#endif
-
-
-
-
-void
-add2_rshift_add_s16_mmx (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3,
- int16_t *s4_2, int n)
-{
- while (n&3) {
- int x;
-
- x = s4_2[0] + s2[0] + s3[0];
- x >>= s4_2[1];
- d1[0] = s1[0] + x;
-
- d1++;
- s1++;
- s2++;
- s3++;
- n--;
- }
- if (n==0) return;
-
- n>>=2;
- asm volatile ("\n"
- " movd 0(%[s4_2]), %%mm4\n"
- " pshufw $0x00, %%mm4, %%mm4\n"
- " movzwl 2(%[s4_2]), %%ecx\n"
- " movd %%ecx, %%mm5\n"
- :: [s4_2] "r" (s4_2)
- : "ecx");
- asm volatile ("\n"
- "1:\n"
- " movq %%mm4, %%mm0\n"
- " paddw 0(%[s2]), %%mm0\n"
- " paddw 0(%[s3]), %%mm0\n"
- " psraw %%mm5, %%mm0\n"
- " paddw 0(%[s1]), %%mm0\n"
- " movq %%mm0, 0(%[d1])\n"
-
- " add $8, %[d1]\n"
- " add $8, %[s1]\n"
- " add $8, %[s2]\n"
- " add $8, %[s3]\n"
- " decl %[n]\n"
- " jnz 1b\n"
- " emms\n"
- : [d1] "+r" (d1), [s1] "+r" (s1), [s2] "+r" (s2),
- [s3] "+r" (s3), [n] "+r" (n)
- :
- );
-}
-OIL_DEFINE_IMPL_FULL (add2_rshift_add_s16_mmx, add2_rshift_add_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
-
-void
-add2_rshift_sub_s16_mmx (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3,
- int16_t *s4_2, int n)
-{
- while (n&3) {
- int x;
-
- x = s4_2[0] + s2[0] + s3[0];
- x >>= s4_2[1];
- d1[0] = s1[0] - x;
-
- d1++;
- s1++;
- s2++;
- s3++;
- n--;
- }
- if (n==0) return;
-
- n>>=2;
- asm volatile ("\n"
- " movd 0(%[s4_2]), %%mm4\n"
- " pshufw $0x00, %%mm4, %%mm4\n"
- " movzwl 2(%[s4_2]), %%ecx\n"
- " movd %%ecx, %%mm5\n"
- :: [s4_2] "r" (s4_2)
- : "ecx");
- asm volatile ("\n"
- "1:\n"
- " movq %%mm4, %%mm0\n"
- " paddw 0(%[s2]), %%mm0\n"
- " paddw 0(%[s3]), %%mm0\n"
- " psraw %%mm5, %%mm0\n"
- " movq 0(%[s1]), %%mm1\n"
- " psubw %%mm0, %%mm1\n"
- " movq %%mm1, 0(%[d1])\n"
-
- " add $8, %[d1]\n"
- " add $8, %[s1]\n"
- " add $8, %[s2]\n"
- " add $8, %[s3]\n"
- " decl %[n]\n"
- " jnz 1b\n"
- " emms\n"
- : [d1] "+r" (d1), [s1] "+r" (s1), [s2] "+r" (s2),
- [s3] "+r" (s3), [n] "+r" (n)
- :
- );
-}
-OIL_DEFINE_IMPL_FULL (add2_rshift_sub_s16_mmx, add2_rshift_sub_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
-
-void
-add2_rshift_add_s16_mmx_unroll2 (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3,
- int16_t *s4_2, int n)
-{
- while (n&7) {
- int x;
-
- x = s4_2[0] + s2[0] + s3[0];
- x >>= s4_2[1];
- d1[0] = s1[0] + x;
-
- d1++;
- s1++;
- s2++;
- s3++;
- n--;
- }
- if (n==0) return;
-
- n>>=3;
- asm volatile ("\n"
- " movd 0(%[s4_2]), %%mm4\n"
- " pshufw $0x00, %%mm4, %%mm4\n"
- " movzwl 2(%[s4_2]), %%ecx\n"
- " movd %%ecx, %%mm5\n"
- :: [s4_2] "r" (s4_2)
- : "ecx");
- asm volatile ("\n"
- "1:\n"
- " movq %%mm4, %%mm0\n"
- " paddw 0(%[s2]), %%mm0\n"
- " paddw 0(%[s3]), %%mm0\n"
- " psraw %%mm5, %%mm0\n"
- " paddw 0(%[s1]), %%mm0\n"
- " movq %%mm0, 0(%[d1])\n"
-
- " movq %%mm4, %%mm0\n"
- " paddw 8(%[s2]), %%mm0\n"
- " paddw 8(%[s3]), %%mm0\n"
- " psraw %%mm5, %%mm0\n"
- " paddw 8(%[s1]), %%mm0\n"
- " movq %%mm0, 8(%[d1])\n"
-
- " add $16, %[d1]\n"
- " add $16, %[s1]\n"
- " add $16, %[s2]\n"
- " add $16, %[s3]\n"
- " decl %[n]\n"
- " jnz 1b\n"
- " emms\n"
- : [d1] "+r" (d1), [s1] "+r" (s1), [s2] "+r" (s2),
- [s3] "+r" (s3), [n] "+r" (n)
- :
- );
-}
-OIL_DEFINE_IMPL_FULL (add2_rshift_add_s16_mmx_unroll2, add2_rshift_add_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
-
-void
-add2_rshift_sub_s16_mmx_unroll2 (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3,
- int16_t *s4_2, int n)
-{
- while (n&7) {
- int x;
-
- x = s4_2[0] + s2[0] + s3[0];
- x >>= s4_2[1];
- d1[0] = s1[0] - x;
-
- d1++;
- s1++;
- s2++;
- s3++;
- n--;
- }
- if (n==0) return;
-
- n>>=3;
- asm volatile ("\n"
- " movd 0(%[s4_2]), %%mm4\n"
- " pshufw $0x00, %%mm4, %%mm4\n"
- " movzwl 2(%[s4_2]), %%ecx\n"
- " movd %%ecx, %%mm5\n"
- :: [s4_2] "r" (s4_2)
- : "ecx");
- asm volatile ("\n"
- "1:\n"
- " movq %%mm4, %%mm0\n"
- " paddw 0(%[s2]), %%mm0\n"
- " paddw 0(%[s3]), %%mm0\n"
- " psraw %%mm5, %%mm0\n"
- " movq 0(%[s1]), %%mm1\n"
- " psubw %%mm0, %%mm1\n"
- " movq %%mm1, 0(%[d1])\n"
-
- " movq %%mm4, %%mm0\n"
- " paddw 8(%[s2]), %%mm0\n"
- " paddw 8(%[s3]), %%mm0\n"
- " psraw %%mm5, %%mm0\n"
- " movq 8(%[s1]), %%mm1\n"
- " psubw %%mm0, %%mm1\n"
- " movq %%mm1, 8(%[d1])\n"
-
- " add $16, %[d1]\n"
- " add $16, %[s1]\n"
- " add $16, %[s2]\n"
- " add $16, %[s3]\n"
- " decl %[n]\n"
- " jnz 1b\n"
- " emms\n"
- : [d1] "+r" (d1), [s1] "+r" (s1), [s2] "+r" (s2),
- [s3] "+r" (s3), [n] "+r" (n)
- :
- );
-}
-OIL_DEFINE_IMPL_FULL (add2_rshift_sub_s16_mmx_unroll2, add2_rshift_sub_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
-
-void
-add2_rshift_add_s16_mmx_unroll4 (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3,
- int16_t *s4_2, int n)
-{
- while (n&15) {
- int x;
-
- x = s4_2[0] + s2[0] + s3[0];
- x >>= s4_2[1];
- d1[0] = s1[0] + x;
-
- d1++;
- s1++;
- s2++;
- s3++;
- n--;
- }
- if (n==0) return;
-
- n>>=4;
- asm volatile ("\n"
- " movd 0(%[s4_2]), %%mm4\n"
- " pshufw $0x00, %%mm4, %%mm4\n"
- " movzwl 2(%[s4_2]), %%ecx\n"
- " movd %%ecx, %%mm5\n"
- :: [s4_2] "r" (s4_2)
- : "ecx");
- asm volatile ("\n"
- "1:\n"
- " movq %%mm4, %%mm0\n"
- " paddw 0(%[s2]), %%mm0\n"
- " paddw 0(%[s3]), %%mm0\n"
- " psraw %%mm5, %%mm0\n"
- " paddw 0(%[s1]), %%mm0\n"
- " movq %%mm0, 0(%[d1])\n"
-
- " movq %%mm4, %%mm0\n"
- " paddw 8(%[s2]), %%mm0\n"
- " paddw 8(%[s3]), %%mm0\n"
- " psraw %%mm5, %%mm0\n"
- " paddw 8(%[s1]), %%mm0\n"
- " movq %%mm0, 8(%[d1])\n"
-
- " movq %%mm4, %%mm0\n"
- " paddw 16(%[s2]), %%mm0\n"
- " paddw 16(%[s3]), %%mm0\n"
- " psraw %%mm5, %%mm0\n"
- " paddw 16(%[s1]), %%mm0\n"
- " movq %%mm0, 16(%[d1])\n"
-
- " movq %%mm4, %%mm0\n"
- " paddw 24(%[s2]), %%mm0\n"
- " paddw 24(%[s3]), %%mm0\n"
- " psraw %%mm5, %%mm0\n"
- " paddw 24(%[s1]), %%mm0\n"
- " movq %%mm0, 24(%[d1])\n"
-
- " add $32, %[d1]\n"
- " add $32, %[s1]\n"
- " add $32, %[s2]\n"
- " add $32, %[s3]\n"
- " decl %[n]\n"
- " jnz 1b\n"
- " emms\n"
- : [d1] "+r" (d1), [s1] "+r" (s1), [s2] "+r" (s2),
- [s3] "+r" (s3), [n] "+r" (n)
- :
- );
-}
-OIL_DEFINE_IMPL_FULL (add2_rshift_add_s16_mmx_unroll4, add2_rshift_add_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
-
-void
-add2_rshift_sub_s16_mmx_unroll4 (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3,
- int16_t *s4_2, int n)
-{
- while (n&15) {
- int x;
-
- x = s4_2[0] + s2[0] + s3[0];
- x >>= s4_2[1];
- d1[0] = s1[0] - x;
-
- d1++;
- s1++;
- s2++;
- s3++;
- n--;
- }
- if (n==0) return;
-
- n>>=4;
- asm volatile ("\n"
- " movd 0(%[s4_2]), %%mm4\n"
- " pshufw $0x00, %%mm4, %%mm4\n"
- " movzwl 2(%[s4_2]), %%ecx\n"
- " movd %%ecx, %%mm5\n"
- :: [s4_2] "r" (s4_2)
- : "ecx");
- asm volatile ("\n"
- "1:\n"
- " movq %%mm4, %%mm0\n"
- " paddw 0(%[s2]), %%mm0\n"
- " paddw 0(%[s3]), %%mm0\n"
- " psraw %%mm5, %%mm0\n"
- " movq 0(%[s1]), %%mm1\n"
- " psubw %%mm0, %%mm1\n"
- " movq %%mm1, 0(%[d1])\n"
-
- " movq %%mm4, %%mm2\n"
- " paddw 8(%[s2]), %%mm2\n"
- " paddw 8(%[s3]), %%mm2\n"
- " psraw %%mm5, %%mm2\n"
- " movq 8(%[s1]), %%mm3\n"
- " psubw %%mm2, %%mm3\n"
- " movq %%mm3, 8(%[d1])\n"
-
- " movq %%mm4, %%mm0\n"
- " paddw 16(%[s2]), %%mm0\n"
- " paddw 16(%[s3]), %%mm0\n"
- " psraw %%mm5, %%mm0\n"
- " movq 16(%[s1]), %%mm1\n"
- " psubw %%mm0, %%mm1\n"
- " movq %%mm1, 16(%[d1])\n"
-
- " movq %%mm4, %%mm2\n"
- " paddw 24(%[s2]), %%mm2\n"
- " paddw 24(%[s3]), %%mm2\n"
- " psraw %%mm5, %%mm2\n"
- " movq 24(%[s1]), %%mm3\n"
- " psubw %%mm2, %%mm3\n"
- " movq %%mm3, 24(%[d1])\n"
-
- " add $32, %[d1]\n"
- " add $32, %[s1]\n"
- " add $32, %[s2]\n"
- " add $32, %[s3]\n"
- " decl %[n]\n"
- " jnz 1b\n"
- " emms\n"
- : [d1] "+r" (d1), [s1] "+r" (s1), [s2] "+r" (s2),
- [s3] "+r" (s3), [n] "+r" (n)
- :
- );
-}
-OIL_DEFINE_IMPL_FULL (add2_rshift_sub_s16_mmx_unroll4, add2_rshift_sub_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
-
-
-void
-add2_rshift_add_s16_sse (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3,
- int16_t *s4_2, int n)
-{
- while (n&7) {
- int x;
-
- x = s4_2[0] + s2[0] + s3[0];
- x >>= s4_2[1];
- d1[0] = s1[0] + x;
-
- d1++;
- s1++;
- s2++;
- s3++;
- n--;
- }
- if (n==0) return;
-
- n>>=3;
- asm volatile ("\n"
- " movd 0(%[s4_2]), %%xmm4\n"
- " pshuflw $0x00, %%xmm4, %%xmm4\n"
- " pshufd $0x00, %%xmm4, %%xmm4\n"
- " movzwl 2(%[s4_2]), %%ecx\n"
- " movd %%ecx, %%xmm5\n"
- :: [s4_2] "r" (s4_2)
- : "ecx");
- asm volatile ("\n"
- "1:\n"
-#if 0
- " movdqu %%xmm4, %%xmm0\n"
- " movdqu 0(%[s2]), %%xmm1\n"
- " paddw %%xmm1, %%xmm0\n"
- " movdqu 0(%[s3]), %%xmm2\n"
- " paddw %%xmm2, %%xmm0\n"
- " psraw %%xmm5, %%xmm0\n"
- " movdqu 0(%[d1]), %%xmm1\n"
- " paddw %%xmm1, %%xmm0\n"
- " movdqu %%xmm0, 0(%[d1])\n"
-#endif
- " movdqu %%xmm4, %%xmm0\n"
- " movdqu 0(%[s2]), %%xmm1\n"
- " paddw %%xmm1, %%xmm0\n"
- " movdqu 0(%[s3]), %%xmm2\n"
- " paddw %%xmm2, %%xmm0\n"
- " psraw %%xmm5, %%xmm0\n"
- " movdqu 0(%[s1]), %%xmm1\n"
- " paddw %%xmm0, %%xmm1\n"
- " movdqu %%xmm1, 0(%[d1])\n"
-
- " add $16, %[d1]\n"
- " add $16, %[s1]\n"
- " add $16, %[s2]\n"
- " add $16, %[s3]\n"
- " decl %[n]\n"
- " jnz 1b\n"
- : [d1] "+r" (d1), [s1] "+r" (s1), [s2] "+r" (s2),
- [s3] "+r" (s3), [n] "+r" (n)
- :
- );
-}
-OIL_DEFINE_IMPL_FULL (add2_rshift_add_s16_sse, add2_rshift_add_s16, OIL_IMPL_FLAG_MMXEXT|OIL_IMPL_FLAG_SSE|OIL_IMPL_FLAG_SSE2);
-
-void
-add2_rshift_sub_s16_sse (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3,
- int16_t *s4_2, int n)
-{
- while (n&7) {
- int x;
-
- x = s4_2[0] + s2[0] + s3[0];
- x >>= s4_2[1];
- d1[0] = s1[0] - x;
-
- d1++;
- s1++;
- s2++;
- s3++;
- n--;
- }
- if (n==0) return;
-
- n>>=3;
- asm volatile ("\n"
- " movd 0(%[s4_2]), %%xmm4\n"
- " pshuflw $0x00, %%xmm4, %%xmm4\n"
- " pshufd $0x00, %%xmm4, %%xmm4\n"
- " movzwl 2(%[s4_2]), %%ecx\n"
- " movd %%ecx, %%xmm5\n"
- :: [s4_2] "r" (s4_2)
- : "ecx");
- asm volatile ("\n"
- "1:\n"
- " movdqu %%xmm4, %%xmm0\n"
- " movdqu 0(%[s2]), %%xmm1\n"
- " paddw %%xmm1, %%xmm0\n"
- " movdqu 0(%[s3]), %%xmm2\n"
- " paddw %%xmm2, %%xmm0\n"
- " psraw %%xmm5, %%xmm0\n"
- " movdqu 0(%[s1]), %%xmm1\n"
- " psubw %%xmm0, %%xmm1\n"
- " movdqu %%xmm1, 0(%[d1])\n"
-
- " add $16, %[d1]\n"
- " add $16, %[s1]\n"
- " add $16, %[s2]\n"
- " add $16, %[s3]\n"
- " decl %[n]\n"
- " jnz 1b\n"
- : [d1] "+r" (d1), [s1] "+r" (s1), [s2] "+r" (s2),
- [s3] "+r" (s3), [n] "+r" (n)
- :
- );
-}
-OIL_DEFINE_IMPL_FULL (add2_rshift_sub_s16_sse, add2_rshift_sub_s16, OIL_IMPL_FLAG_MMXEXT|OIL_IMPL_FLAG_SSE|OIL_IMPL_FLAG_SSE2);
-
diff --git a/liboil/i386_amd64/Makefile.am b/liboil/i386_amd64/Makefile.am
index b7349bb..b44dae1 100644
--- a/liboil/i386_amd64/Makefile.am
+++ b/liboil/i386_amd64/Makefile.am
@@ -2,12 +2,15 @@
noinst_LTLIBRARIES = libi386_amd64.la
libi386_amd64_la_SOURCES = \
+ add2.c \
addc.c \
clamp.c \
convert.c \
copy.c \
idct8x8_i386.c \
+ mas.c \
math.c \
+ multiply_and_acc.c \
mt19937.c \
resample.c \
sad8x8.c \
diff --git a/liboil/i386_amd64/add2.c b/liboil/i386_amd64/add2.c
new file mode 100644
index 0000000..7c4f3ab
--- /dev/null
+++ b/liboil/i386_amd64/add2.c
@@ -0,0 +1,488 @@
+
+#include <liboil/liboilfunction.h>
+#include <liboil/liboilclasses.h>
+
+
+
+
+void
+add2_rshift_add_s16_mmx (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3,
+ int16_t *s4_2, int n)
+{
+ while (n&3) {
+ int x;
+
+ x = s4_2[0] + s2[0] + s3[0];
+ x >>= s4_2[1];
+ d1[0] = s1[0] + x;
+
+ d1++;
+ s1++;
+ s2++;
+ s3++;
+ n--;
+ }
+ if (n==0) return;
+
+ n>>=2;
+ asm volatile ("\n"
+ " movd 0(%[s4_2]), %%mm4\n"
+ " pshufw $0x00, %%mm4, %%mm4\n"
+ " movzwl 2(%[s4_2]), %%ecx\n"
+ " movd %%ecx, %%mm5\n"
+ :: [s4_2] "r" (s4_2)
+ : "ecx");
+ asm volatile ("\n"
+ "1:\n"
+ " movq %%mm4, %%mm0\n"
+ " paddw 0(%[s2]), %%mm0\n"
+ " paddw 0(%[s3]), %%mm0\n"
+ " psraw %%mm5, %%mm0\n"
+ " paddw 0(%[s1]), %%mm0\n"
+ " movq %%mm0, 0(%[d1])\n"
+
+ " add $8, %[d1]\n"
+ " add $8, %[s1]\n"
+ " add $8, %[s2]\n"
+ " add $8, %[s3]\n"
+ " decl %[n]\n"
+ " jnz 1b\n"
+ " emms\n"
+ : [d1] "+r" (d1), [s1] "+r" (s1), [s2] "+r" (s2),
+ [s3] "+r" (s3), [n] "+r" (n)
+ :
+ );
+}
+OIL_DEFINE_IMPL_FULL (add2_rshift_add_s16_mmx, add2_rshift_add_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
+
+void
+add2_rshift_sub_s16_mmx (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3,
+ int16_t *s4_2, int n)
+{
+ while (n&3) {
+ int x;
+
+ x = s4_2[0] + s2[0] + s3[0];
+ x >>= s4_2[1];
+ d1[0] = s1[0] - x;
+
+ d1++;
+ s1++;
+ s2++;
+ s3++;
+ n--;
+ }
+ if (n==0) return;
+
+ n>>=2;
+ asm volatile ("\n"
+ " movd 0(%[s4_2]), %%mm4\n"
+ " pshufw $0x00, %%mm4, %%mm4\n"
+ " movzwl 2(%[s4_2]), %%ecx\n"
+ " movd %%ecx, %%mm5\n"
+ :: [s4_2] "r" (s4_2)
+ : "ecx");
+ asm volatile ("\n"
+ "1:\n"
+ " movq %%mm4, %%mm0\n"
+ " paddw 0(%[s2]), %%mm0\n"
+ " paddw 0(%[s3]), %%mm0\n"
+ " psraw %%mm5, %%mm0\n"
+ " movq 0(%[s1]), %%mm1\n"
+ " psubw %%mm0, %%mm1\n"
+ " movq %%mm1, 0(%[d1])\n"
+
+ " add $8, %[d1]\n"
+ " add $8, %[s1]\n"
+ " add $8, %[s2]\n"
+ " add $8, %[s3]\n"
+ " decl %[n]\n"
+ " jnz 1b\n"
+ " emms\n"
+ : [d1] "+r" (d1), [s1] "+r" (s1), [s2] "+r" (s2),
+ [s3] "+r" (s3), [n] "+r" (n)
+ :
+ );
+}
+OIL_DEFINE_IMPL_FULL (add2_rshift_sub_s16_mmx, add2_rshift_sub_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
+
+void
+add2_rshift_add_s16_mmx_unroll2 (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3,
+ int16_t *s4_2, int n)
+{
+ while (n&7) {
+ int x;
+
+ x = s4_2[0] + s2[0] + s3[0];
+ x >>= s4_2[1];
+ d1[0] = s1[0] + x;
+
+ d1++;
+ s1++;
+ s2++;
+ s3++;
+ n--;
+ }
+ if (n==0) return;
+
+ n>>=3;
+ asm volatile ("\n"
+ " movd 0(%[s4_2]), %%mm4\n"
+ " pshufw $0x00, %%mm4, %%mm4\n"
+ " movzwl 2(%[s4_2]), %%ecx\n"
+ " movd %%ecx, %%mm5\n"
+ :: [s4_2] "r" (s4_2)
+ : "ecx");
+ asm volatile ("\n"
+ "1:\n"
+ " movq %%mm4, %%mm0\n"
+ " paddw 0(%[s2]), %%mm0\n"
+ " paddw 0(%[s3]), %%mm0\n"
+ " psraw %%mm5, %%mm0\n"
+ " paddw 0(%[s1]), %%mm0\n"
+ " movq %%mm0, 0(%[d1])\n"
+
+ " movq %%mm4, %%mm0\n"
+ " paddw 8(%[s2]), %%mm0\n"
+ " paddw 8(%[s3]), %%mm0\n"
+ " psraw %%mm5, %%mm0\n"
+ " paddw 8(%[s1]), %%mm0\n"
+ " movq %%mm0, 8(%[d1])\n"
+
+ " add $16, %[d1]\n"
+ " add $16, %[s1]\n"
+ " add $16, %[s2]\n"
+ " add $16, %[s3]\n"
+ " decl %[n]\n"
+ " jnz 1b\n"
+ " emms\n"
+ : [d1] "+r" (d1), [s1] "+r" (s1), [s2] "+r" (s2),
+ [s3] "+r" (s3), [n] "+r" (n)
+ :
+ );
+}
+OIL_DEFINE_IMPL_FULL (add2_rshift_add_s16_mmx_unroll2, add2_rshift_add_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
+
+void
+add2_rshift_sub_s16_mmx_unroll2 (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3,
+ int16_t *s4_2, int n)
+{
+ while (n&7) {
+ int x;
+
+ x = s4_2[0] + s2[0] + s3[0];
+ x >>= s4_2[1];
+ d1[0] = s1[0] - x;
+
+ d1++;
+ s1++;
+ s2++;
+ s3++;
+ n--;
+ }
+ if (n==0) return;
+
+ n>>=3;
+ asm volatile ("\n"
+ " movd 0(%[s4_2]), %%mm4\n"
+ " pshufw $0x00, %%mm4, %%mm4\n"
+ " movzwl 2(%[s4_2]), %%ecx\n"
+ " movd %%ecx, %%mm5\n"
+ :: [s4_2] "r" (s4_2)
+ : "ecx");
+ asm volatile ("\n"
+ "1:\n"
+ " movq %%mm4, %%mm0\n"
+ " paddw 0(%[s2]), %%mm0\n"
+ " paddw 0(%[s3]), %%mm0\n"
+ " psraw %%mm5, %%mm0\n"
+ " movq 0(%[s1]), %%mm1\n"
+ " psubw %%mm0, %%mm1\n"
+ " movq %%mm1, 0(%[d1])\n"
+
+ " movq %%mm4, %%mm0\n"
+ " paddw 8(%[s2]), %%mm0\n"
+ " paddw 8(%[s3]), %%mm0\n"
+ " psraw %%mm5, %%mm0\n"
+ " movq 8(%[s1]), %%mm1\n"
+ " psubw %%mm0, %%mm1\n"
+ " movq %%mm1, 8(%[d1])\n"
+
+ " add $16, %[d1]\n"
+ " add $16, %[s1]\n"
+ " add $16, %[s2]\n"
+ " add $16, %[s3]\n"
+ " decl %[n]\n"
+ " jnz 1b\n"
+ " emms\n"
+ : [d1] "+r" (d1), [s1] "+r" (s1), [s2] "+r" (s2),
+ [s3] "+r" (s3), [n] "+r" (n)
+ :
+ );
+}
+OIL_DEFINE_IMPL_FULL (add2_rshift_sub_s16_mmx_unroll2, add2_rshift_sub_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
+
+void
+add2_rshift_add_s16_mmx_unroll4 (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3,
+ int16_t *s4_2, int n)
+{
+ while (n&15) {
+ int x;
+
+ x = s4_2[0] + s2[0] + s3[0];
+ x >>= s4_2[1];
+ d1[0] = s1[0] + x;
+
+ d1++;
+ s1++;
+ s2++;
+ s3++;
+ n--;
+ }
+ if (n==0) return;
+
+ n>>=4;
+ asm volatile ("\n"
+ " movd 0(%[s4_2]), %%mm4\n"
+ " pshufw $0x00, %%mm4, %%mm4\n"
+ " movzwl 2(%[s4_2]), %%ecx\n"
+ " movd %%ecx, %%mm5\n"
+ :: [s4_2] "r" (s4_2)
+ : "ecx");
+ asm volatile ("\n"
+ "1:\n"
+ " movq %%mm4, %%mm0\n"
+ " paddw 0(%[s2]), %%mm0\n"
+ " paddw 0(%[s3]), %%mm0\n"
+ " psraw %%mm5, %%mm0\n"
+ " paddw 0(%[s1]), %%mm0\n"
+ " movq %%mm0, 0(%[d1])\n"
+
+ " movq %%mm4, %%mm0\n"
+ " paddw 8(%[s2]), %%mm0\n"
+ " paddw 8(%[s3]), %%mm0\n"
+ " psraw %%mm5, %%mm0\n"
+ " paddw 8(%[s1]), %%mm0\n"
+ " movq %%mm0, 8(%[d1])\n"
+
+ " movq %%mm4, %%mm0\n"
+ " paddw 16(%[s2]), %%mm0\n"
+ " paddw 16(%[s3]), %%mm0\n"
+ " psraw %%mm5, %%mm0\n"
+ " paddw 16(%[s1]), %%mm0\n"
+ " movq %%mm0, 16(%[d1])\n"
+
+ " movq %%mm4, %%mm0\n"
+ " paddw 24(%[s2]), %%mm0\n"
+ " paddw 24(%[s3]), %%mm0\n"
+ " psraw %%mm5, %%mm0\n"
+ " paddw 24(%[s1]), %%mm0\n"
+ " movq %%mm0, 24(%[d1])\n"
+
+ " add $32, %[d1]\n"
+ " add $32, %[s1]\n"
+ " add $32, %[s2]\n"
+ " add $32, %[s3]\n"
+ " decl %[n]\n"
+ " jnz 1b\n"
+ " emms\n"
+ : [d1] "+r" (d1), [s1] "+r" (s1), [s2] "+r" (s2),
+ [s3] "+r" (s3), [n] "+r" (n)
+ :
+ );
+}
+OIL_DEFINE_IMPL_FULL (add2_rshift_add_s16_mmx_unroll4, add2_rshift_add_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
+
+void
+add2_rshift_sub_s16_mmx_unroll4 (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3,
+ int16_t *s4_2, int n)
+{
+ while (n&15) {
+ int x;
+
+ x = s4_2[0] + s2[0] + s3[0];
+ x >>= s4_2[1];
+ d1[0] = s1[0] - x;
+
+ d1++;
+ s1++;
+ s2++;
+ s3++;
+ n--;
+ }
+ if (n==0) return;
+
+ n>>=4;
+ asm volatile ("\n"
+ " movd 0(%[s4_2]), %%mm4\n"
+ " pshufw $0x00, %%mm4, %%mm4\n"
+ " movzwl 2(%[s4_2]), %%ecx\n"
+ " movd %%ecx, %%mm5\n"
+ :: [s4_2] "r" (s4_2)
+ : "ecx");
+ asm volatile ("\n"
+ "1:\n"
+ " movq %%mm4, %%mm0\n"
+ " paddw 0(%[s2]), %%mm0\n"
+ " paddw 0(%[s3]), %%mm0\n"
+ " psraw %%mm5, %%mm0\n"
+ " movq 0(%[s1]), %%mm1\n"
+ " psubw %%mm0, %%mm1\n"
+ " movq %%mm1, 0(%[d1])\n"
+
+ " movq %%mm4, %%mm2\n"
+ " paddw 8(%[s2]), %%mm2\n"
+ " paddw 8(%[s3]), %%mm2\n"
+ " psraw %%mm5, %%mm2\n"
+ " movq 8(%[s1]), %%mm3\n"
+ " psubw %%mm2, %%mm3\n"
+ " movq %%mm3, 8(%[d1])\n"
+
+ " movq %%mm4, %%mm0\n"
+ " paddw 16(%[s2]), %%mm0\n"
+ " paddw 16(%[s3]), %%mm0\n"
+ " psraw %%mm5, %%mm0\n"
+ " movq 16(%[s1]), %%mm1\n"
+ " psubw %%mm0, %%mm1\n"
+ " movq %%mm1, 16(%[d1])\n"
+
+ " movq %%mm4, %%mm2\n"
+ " paddw 24(%[s2]), %%mm2\n"
+ " paddw 24(%[s3]), %%mm2\n"
+ " psraw %%mm5, %%mm2\n"
+ " movq 24(%[s1]), %%mm3\n"
+ " psubw %%mm2, %%mm3\n"
+ " movq %%mm3, 24(%[d1])\n"
+
+ " add $32, %[d1]\n"
+ " add $32, %[s1]\n"
+ " add $32, %[s2]\n"
+ " add $32, %[s3]\n"
+ " decl %[n]\n"
+ " jnz 1b\n"
+ " emms\n"
+ : [d1] "+r" (d1), [s1] "+r" (s1), [s2] "+r" (s2),
+ [s3] "+r" (s3), [n] "+r" (n)
+ :
+ );
+}
+OIL_DEFINE_IMPL_FULL (add2_rshift_sub_s16_mmx_unroll4, add2_rshift_sub_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
+
+
+void
+add2_rshift_add_s16_sse (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3,
+ int16_t *s4_2, int n)
+{
+ while (n&7) {
+ int x;
+
+ x = s4_2[0] + s2[0] + s3[0];
+ x >>= s4_2[1];
+ d1[0] = s1[0] + x;
+
+ d1++;
+ s1++;
+ s2++;
+ s3++;
+ n--;
+ }
+ if (n==0) return;
+
+ n>>=3;
+ asm volatile ("\n"
+ " movd 0(%[s4_2]), %%xmm4\n"
+ " pshuflw $0x00, %%xmm4, %%xmm4\n"
+ " pshufd $0x00, %%xmm4, %%xmm4\n"
+ " movzwl 2(%[s4_2]), %%ecx\n"
+ " movd %%ecx, %%xmm5\n"
+ :: [s4_2] "r" (s4_2)
+ : "ecx");
+ asm volatile ("\n"
+ "1:\n"
+#if 0
+ " movdqu %%xmm4, %%xmm0\n"
+ " movdqu 0(%[s2]), %%xmm1\n"
+ " paddw %%xmm1, %%xmm0\n"
+ " movdqu 0(%[s3]), %%xmm2\n"
+ " paddw %%xmm2, %%xmm0\n"
+ " psraw %%xmm5, %%xmm0\n"
+ " movdqu 0(%[d1]), %%xmm1\n"
+ " paddw %%xmm1, %%xmm0\n"
+ " movdqu %%xmm0, 0(%[d1])\n"
+#endif
+ " movdqu %%xmm4, %%xmm0\n"
+ " movdqu 0(%[s2]), %%xmm1\n"
+ " paddw %%xmm1, %%xmm0\n"
+ " movdqu 0(%[s3]), %%xmm2\n"
+ " paddw %%xmm2, %%xmm0\n"
+ " psraw %%xmm5, %%xmm0\n"
+ " movdqu 0(%[s1]), %%xmm1\n"
+ " paddw %%xmm0, %%xmm1\n"
+ " movdqu %%xmm1, 0(%[d1])\n"
+
+ " add $16, %[d1]\n"
+ " add $16, %[s1]\n"
+ " add $16, %[s2]\n"
+ " add $16, %[s3]\n"
+ " decl %[n]\n"
+ " jnz 1b\n"
+ : [d1] "+r" (d1), [s1] "+r" (s1), [s2] "+r" (s2),
+ [s3] "+r" (s3), [n] "+r" (n)
+ :
+ );
+}
+OIL_DEFINE_IMPL_FULL (add2_rshift_add_s16_sse, add2_rshift_add_s16, OIL_IMPL_FLAG_MMXEXT|OIL_IMPL_FLAG_SSE|OIL_IMPL_FLAG_SSE2);
+
+void
+add2_rshift_sub_s16_sse (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3,
+ int16_t *s4_2, int n)
+{
+ while (n&7) {
+ int x;
+
+ x = s4_2[0] + s2[0] + s3[0];
+ x >>= s4_2[1];
+ d1[0] = s1[0] - x;
+
+ d1++;
+ s1++;
+ s2++;
+ s3++;
+ n--;
+ }
+ if (n==0) return;
+
+ n>>=3;
+ asm volatile ("\n"
+ " movd 0(%[s4_2]), %%xmm4\n"
+ " pshuflw $0x00, %%xmm4, %%xmm4\n"
+ " pshufd $0x00, %%xmm4, %%xmm4\n"
+ " movzwl 2(%[s4_2]), %%ecx\n"
+ " movd %%ecx, %%xmm5\n"
+ :: [s4_2] "r" (s4_2)
+ : "ecx");
+ asm volatile ("\n"
+ "1:\n"
+ " movdqu %%xmm4, %%xmm0\n"
+ " movdqu 0(%[s2]), %%xmm1\n"
+ " paddw %%xmm1, %%xmm0\n"
+ " movdqu 0(%[s3]), %%xmm2\n"
+ " paddw %%xmm2, %%xmm0\n"
+ " psraw %%xmm5, %%xmm0\n"
+ " movdqu 0(%[s1]), %%xmm1\n"
+ " psubw %%xmm0, %%xmm1\n"
+ " movdqu %%xmm1, 0(%[d1])\n"
+
+ " add $16, %[d1]\n"
+ " add $16, %[s1]\n"
+ " add $16, %[s2]\n"
+ " add $16, %[s3]\n"
+ " decl %[n]\n"
+ " jnz 1b\n"
+ : [d1] "+r" (d1), [s1] "+r" (s1), [s2] "+r" (s2),
+ [s3] "+r" (s3), [n] "+r" (n)
+ :
+ );
+}
+OIL_DEFINE_IMPL_FULL (add2_rshift_sub_s16_sse, add2_rshift_sub_s16, OIL_IMPL_FLAG_MMXEXT|OIL_IMPL_FLAG_SSE|OIL_IMPL_FLAG_SSE2);
+
diff --git a/liboil/i386_amd64/mas.c b/liboil/i386_amd64/mas.c
new file mode 100644
index 0000000..20d4d2f
--- /dev/null
+++ b/liboil/i386_amd64/mas.c
@@ -0,0 +1,1114 @@
+
+#include <liboil/liboilfunction.h>
+#include <liboil/liboilclasses.h>
+
+
+/* mas10_u8_mmx: 10-tap FIR on unsigned bytes.
+ *   d[i] = CLAMP((sum_j s1_np9[i+j]*s2_10[j] + s3_2[0]) >> s3_2[1], 0, 255)
+ * Scalar loop handles the leading (n & 3) outputs; the MMX loop then
+ * produces 4 outputs per iteration.  Low clamp is pmaxsw against zero,
+ * high clamp comes from packuswb's unsigned saturation.
+ */
+void
+mas10_u8_mmx (uint8_t *d, const uint8_t *s1_np9, const int16_t *s2_10,
+    const int16_t *s3_2, int n)
+{
+  int j;
+  int x;
+
+  while(n&3) {
+    x = 0;
+    for(j=0;j<10;j++){
+      x += s1_np9[j] * s2_10[j];
+    }
+    *d = CLAMP((x + s3_2[0])>>s3_2[1],0,255);
+    d++;
+    s1_np9++;
+    n--;
+  }
+
+  if (n == 0) return;
+  n>>=2;
+  /* mm7 = 0 (byte->word unpack), mm6 = offset, mm5 = shift count. */
+  __asm__ __volatile__("\n"
+      "  pxor %%mm7, %%mm7\n"
+
+      "  movd (%[s3_2]), %%mm6\n"
+
+      "  movzwl 2(%[s3_2]), %%ecx\n"
+      "  movd %%ecx, %%mm5\n"
+
+      "1:\n"
+      /* load 128 */
+      "  pshufw $0x00, %%mm6, %%mm2\n"
+
+/* One tap: broadcast s2_10[x] to 4 word lanes, multiply against 4
+ * consecutive input bytes, accumulate into mm2. */
+#define LOOP(x) \
+      "  movd " #x "(%[s1_np9]), %%mm0\n" \
+      "  punpcklbw %%mm7, %%mm0\n" \
+      "  movq 2*" #x "(%[s2_10]), %%mm1\n" \
+      "  pshufw $0x00, %%mm1, %%mm1\n" \
+      "  pmullw %%mm1, %%mm0\n" \
+      "  paddw %%mm0, %%mm2\n"
+
+      LOOP(0)
+      LOOP(1)
+      LOOP(2)
+      LOOP(3)
+      LOOP(4)
+      LOOP(5)
+      LOOP(6)
+      LOOP(7)
+      LOOP(8)
+      LOOP(9)
+#undef LOOP
+
+      "  psraw %%mm5, %%mm2\n"
+      "  pmaxsw %%mm7, %%mm2\n"
+      "  packuswb %%mm2, %%mm2\n"
+      "  movd %%mm2, 0(%[d])\n"
+      "  add $4, %[d]\n"
+      "  add $4, %[s1_np9]\n"
+      "  decl %[n]\n"
+      "  jnz 1b\n"
+      "  emms\n"
+      : [d] "+r" (d),
+        [s1_np9] "+r" (s1_np9),
+        [n] "+m" (n)
+      : [s2_10] "r" (s2_10),
+        [s3_2] "r" (s3_2)
+      : "ecx");
+}
+OIL_DEFINE_IMPL_FULL (mas10_u8_mmx, mas10_u8_l15, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
+
+/* mas10_u8_mmx_2: same contract as mas10_u8_mmx, but each tap is
+ * pre-broadcast into a 4-copy coefficient table on the stack so the inner
+ * loop can use pmullw with a memory operand instead of a per-tap pshufw. */
+void
+mas10_u8_mmx_2 (uint8_t *d, const uint8_t *s1_np9, const int16_t *s2_10,
+    const int16_t *s3_2, int n)
+{
+  int j;
+  int x;
+  int16_t coeff[4*10];   /* tap j replicated at coeff[4*j .. 4*j+3] */
+  int16_t *ptr;
+
+  ptr = coeff;
+
+  while(n&3) {
+    x = 0;
+    for(j=0;j<10;j++){
+      x += s1_np9[j] * s2_10[j];
+    }
+    *d = CLAMP((x + s3_2[0])>>s3_2[1],0,255);
+    d++;
+    s1_np9++;
+    n--;
+  }
+
+  for(j=0;j<10;j++){
+    ptr[4*j + 0] = s2_10[j];
+    ptr[4*j + 1] = s2_10[j];
+    ptr[4*j + 2] = s2_10[j];
+    ptr[4*j + 3] = s2_10[j];
+  }
+
+  if (n == 0) return;
+  n>>=2;
+  __asm__ __volatile__("\n"
+      "  pxor %%mm7, %%mm7\n"
+
+      "  movd (%[s3_2]), %%mm6\n"
+
+      "  movzwl 2(%[s3_2]), %%ecx\n"
+      "  movd %%ecx, %%mm5\n"
+
+      "1:\n"
+      /* load 128 */
+      "  pshufw $0x00, %%mm6, %%mm2\n"
+
+/* One tap: 8*x is the byte offset of tap x in the replicated table. */
+#define LOOP(x) \
+      "  movd " #x "(%[s1_np9]), %%mm0\n" \
+      "  punpcklbw %%mm7, %%mm0\n" \
+      "  pmullw 8*" #x "(%[coeff]), %%mm0\n" \
+      "  paddw %%mm0, %%mm2\n"
+
+      LOOP(0)
+      LOOP(1)
+      LOOP(2)
+      LOOP(3)
+      LOOP(4)
+      LOOP(5)
+      LOOP(6)
+      LOOP(7)
+      LOOP(8)
+      LOOP(9)
+#undef LOOP
+
+      "  psraw %%mm5, %%mm2\n"
+      "  pmaxsw %%mm7, %%mm2\n"
+      "  packuswb %%mm2, %%mm2\n"
+      "  movd %%mm2, 0(%[d])\n"
+      "  add $4, %[d]\n"
+      "  add $4, %[s1_np9]\n"
+      "  decl %[n]\n"
+      "  jnz 1b\n"
+      "  emms\n"
+      : [d] "+r" (d),
+        [s1_np9] "+r" (s1_np9),
+        [n] "+m" (n)
+      : [coeff] "r" (ptr),
+        [s3_2] "r" (s3_2)
+      : "ecx");
+}
+OIL_DEFINE_IMPL_FULL (mas10_u8_mmx_2, mas10_u8_l15, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
+
+/* mas10_u8_mmx_3: same contract as mas10_u8_mmx.  Taps 0-7 are kept
+ * resident in mm3/mm4 and broadcast with pshufw immediates (0x55*k picks
+ * word lane k); taps 8-9 still go through the memory-loading LOOP macro. */
+void
+mas10_u8_mmx_3 (uint8_t *d, const uint8_t *s1_np9, const int16_t *s2_10,
+    const int16_t *s3_2, int n)
+{
+  int j;
+  int x;
+
+  while(n&3) {
+    x = 0;
+    for(j=0;j<10;j++){
+      x += s1_np9[j] * s2_10[j];
+    }
+    *d = CLAMP((x + s3_2[0])>>s3_2[1],0,255);
+    d++;
+    s1_np9++;
+    n--;
+  }
+
+  if (n == 0) return;
+  n>>=2;
+  __asm__ __volatile__("\n"
+      "  pxor %%mm7, %%mm7\n"
+
+      "  movd (%[s3_2]), %%mm6\n"
+
+      "  movzwl 2(%[s3_2]), %%ecx\n"
+      "  movd %%ecx, %%mm5\n"
+
+      /* mm3 = taps 0-3, mm4 = taps 4-7 */
+      "  movq 0(%[s2_10]), %%mm3\n"
+      "  movq 8(%[s2_10]), %%mm4\n"
+
+      "1:\n"
+      /* load 128 */
+      "  pshufw $0x00, %%mm6, %%mm2\n"
+
+#define LOOP(x) \
+      "  movd " #x "(%[s1_np9]), %%mm0\n" \
+      "  punpcklbw %%mm7, %%mm0\n" \
+      "  movq 2*" #x "(%[s2_10]), %%mm1\n" \
+      "  pshufw $0x00, %%mm1, %%mm1\n" \
+      "  pmullw %%mm1, %%mm0\n" \
+      "  paddw %%mm0, %%mm2\n"
+
+      //LOOP(0)
+      "  movd 0(%[s1_np9]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  pshufw $0x00, %%mm3, %%mm1\n"
+      "  pmullw %%mm1, %%mm0\n"
+      "  paddw %%mm0, %%mm2\n"
+
+      //LOOP(1)
+      "  movd 1(%[s1_np9]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  pshufw $0x55*1, %%mm3, %%mm1\n"
+      "  pmullw %%mm1, %%mm0\n"
+      "  paddw %%mm0, %%mm2\n"
+
+      //LOOP(2)
+      "  movd 2(%[s1_np9]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  pshufw $0x55*2, %%mm3, %%mm1\n"
+      "  pmullw %%mm1, %%mm0\n"
+      "  paddw %%mm0, %%mm2\n"
+
+      //LOOP(3)
+      "  movd 3(%[s1_np9]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  pshufw $0x55*3, %%mm3, %%mm1\n"
+      "  pmullw %%mm1, %%mm0\n"
+      "  paddw %%mm0, %%mm2\n"
+
+      //LOOP(4)
+      "  movd 4(%[s1_np9]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  pshufw $0x00, %%mm4, %%mm1\n"
+      "  pmullw %%mm1, %%mm0\n"
+      "  paddw %%mm0, %%mm2\n"
+
+      //LOOP(5)
+      "  movd 5(%[s1_np9]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  pshufw $0x55*1, %%mm4, %%mm1\n"
+      "  pmullw %%mm1, %%mm0\n"
+      "  paddw %%mm0, %%mm2\n"
+
+      //LOOP(6)
+      "  movd 6(%[s1_np9]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  pshufw $0x55*2, %%mm4, %%mm1\n"
+      "  pmullw %%mm1, %%mm0\n"
+      "  paddw %%mm0, %%mm2\n"
+
+      //LOOP(7)
+      "  movd 7(%[s1_np9]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  pshufw $0x55*3, %%mm4, %%mm1\n"
+      "  pmullw %%mm1, %%mm0\n"
+      "  paddw %%mm0, %%mm2\n"
+
+      LOOP(8)
+      LOOP(9)
+#undef LOOP
+
+      "  psraw %%mm5, %%mm2\n"
+      "  pmaxsw %%mm7, %%mm2\n"
+      "  packuswb %%mm2, %%mm2\n"
+      "  movd %%mm2, 0(%[d])\n"
+      "  add $4, %[d]\n"
+      "  add $4, %[s1_np9]\n"
+      "  decl %[n]\n"
+      "  jnz 1b\n"
+      "  emms\n"
+      : [d] "+r" (d),
+        [s1_np9] "+r" (s1_np9),
+        [n] "+m" (n)
+      : [s2_10] "r" (s2_10),
+        [s3_2] "r" (s3_2)
+      : "ecx");
+}
+OIL_DEFINE_IMPL_FULL (mas10_u8_mmx_3, mas10_u8_l15, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
+
+/* mas10_u8_mmx_4: one output per iteration using pmaddwd dot products,
+ * then a horizontal add (psrlq/paddd), offset add, arithmetic shift and
+ * clamp to [0,255].  Registered for the general (unrestricted-tap)
+ * mas10_u8 class.
+ * NOTE(review): after psrad the dword sum is clamped with pmaxsw/packuswb,
+ * which only looks at the low word -- assumes the shifted result fits in
+ * 16 bits; confirm against the class's documented range. */
+void
+mas10_u8_mmx_4 (uint8_t *d, const uint8_t *s1_np9, const int16_t *s2_10,
+    const int16_t *s3_2, int n)
+{
+  if (n == 0) return;
+  __asm__ __volatile__("\n"
+      "  pxor %%mm7, %%mm7\n"
+
+      "  movzwl 0(%[s3_2]), %%ecx\n"
+      "  movd %%ecx, %%mm6\n"
+
+      "  movzwl 2(%[s3_2]), %%ecx\n"
+      "  movd %%ecx, %%mm5\n"
+
+      "1:\n"
+      "  movd 0(%[s1_np9]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  pmaddwd 0(%[s2_10]), %%mm0\n"
+
+      "  movd 4(%[s1_np9]), %%mm1\n"
+      "  punpcklbw %%mm7, %%mm1\n"
+      "  pmaddwd 8(%[s2_10]), %%mm1\n"
+
+      "  movd 8(%[s1_np9]), %%mm2\n"
+      "  punpcklbw %%mm7, %%mm2\n"
+      "  pmaddwd 16(%[s2_10]), %%mm2\n"
+
+      /* fold the two dword partial sums together, add taps 8-9 and offset */
+      "  paddd %%mm1, %%mm0\n"
+      "  movq %%mm0, %%mm1\n"
+      "  psrlq $32, %%mm0\n"
+      "  paddd %%mm1, %%mm0\n"
+      "  paddd %%mm2, %%mm0\n"
+      "  paddd %%mm6, %%mm0\n"
+
+      "  psrad %%mm5, %%mm0\n"
+      "  pmaxsw %%mm7, %%mm0\n"
+      "  packuswb %%mm0, %%mm0\n"
+      "  movd %%mm0, %%ecx\n"
+      "  movb %%cl,0(%[d])\n"
+
+      "  add $1, %[d]\n"
+      "  add $1, %[s1_np9]\n"
+      "  decl %[n]\n"
+      "  jnz 1b\n"
+      "  emms\n"
+      : [d] "+r" (d),
+        [s1_np9] "+r" (s1_np9),
+        [n] "+m" (n)
+      : [s2_10] "r" (s2_10),
+        [s3_2] "r" (s3_2)
+      : "ecx");
+}
+OIL_DEFINE_IMPL_FULL (mas10_u8_mmx_4, mas10_u8, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
+
+
+/* mas8_u8_mmx_3: 8-tap FIR on unsigned bytes, 4 outputs per MMX
+ * iteration.  All 8 taps are held in mm3/mm4 and broadcast with pshufw
+ * immediates; same offset/shift/clamp scheme as mas10_u8_mmx. */
+void
+mas8_u8_mmx_3 (uint8_t *d, const uint8_t *s1_np7, const int16_t *s2_8,
+    const int16_t *s3_2, int n)
+{
+  int j;
+  int x;
+
+  while(n&3) {
+    x = 0;
+    for(j=0;j<8;j++){
+      x += s1_np7[j] * s2_8[j];
+    }
+    *d = CLAMP((x + s3_2[0])>>s3_2[1],0,255);
+    d++;
+    s1_np7++;
+    n--;
+  }
+
+  if (n == 0) return;
+  n>>=2;
+  __asm__ __volatile__("\n"
+      "  pxor %%mm7, %%mm7\n"
+
+      "  movd (%[s3_2]), %%mm6\n"
+
+      "  movzwl 2(%[s3_2]), %%ecx\n"
+      "  movd %%ecx, %%mm5\n"
+
+      /* mm3 = taps 0-3, mm4 = taps 4-7 */
+      "  movq 0(%[s2_8]), %%mm3\n"
+      "  movq 8(%[s2_8]), %%mm4\n"
+
+      "1:\n"
+      /* load 128 */
+      "  pshufw $0x00, %%mm6, %%mm2\n"
+
+      "  movd 0(%[s1_np7]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  pshufw $0x00, %%mm3, %%mm1\n"
+      "  pmullw %%mm1, %%mm0\n"
+      "  paddw %%mm0, %%mm2\n"
+
+      "  movd 1(%[s1_np7]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  pshufw $0x55*1, %%mm3, %%mm1\n"
+      "  pmullw %%mm1, %%mm0\n"
+      "  paddw %%mm0, %%mm2\n"
+
+      "  movd 2(%[s1_np7]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  pshufw $0x55*2, %%mm3, %%mm1\n"
+      "  pmullw %%mm1, %%mm0\n"
+      "  paddw %%mm0, %%mm2\n"
+
+      "  movd 3(%[s1_np7]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  pshufw $0x55*3, %%mm3, %%mm1\n"
+      "  pmullw %%mm1, %%mm0\n"
+      "  paddw %%mm0, %%mm2\n"
+
+      "  movd 4(%[s1_np7]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  pshufw $0x00, %%mm4, %%mm1\n"
+      "  pmullw %%mm1, %%mm0\n"
+      "  paddw %%mm0, %%mm2\n"
+
+      "  movd 5(%[s1_np7]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  pshufw $0x55*1, %%mm4, %%mm1\n"
+      "  pmullw %%mm1, %%mm0\n"
+      "  paddw %%mm0, %%mm2\n"
+
+      "  movd 6(%[s1_np7]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  pshufw $0x55*2, %%mm4, %%mm1\n"
+      "  pmullw %%mm1, %%mm0\n"
+      "  paddw %%mm0, %%mm2\n"
+
+      "  movd 7(%[s1_np7]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  pshufw $0x55*3, %%mm4, %%mm1\n"
+      "  pmullw %%mm1, %%mm0\n"
+      "  paddw %%mm0, %%mm2\n"
+
+      "  psraw %%mm5, %%mm2\n"
+      "  pmaxsw %%mm7, %%mm2\n"
+      "  packuswb %%mm2, %%mm2\n"
+      "  movd %%mm2, 0(%[d])\n"
+      "  add $4, %[d]\n"
+      "  add $4, %[s1_np7]\n"
+      "  decl %[n]\n"
+      "  jnz 1b\n"
+      "  emms\n"
+      : [d] "+r" (d),
+        [s1_np7] "+r" (s1_np7),
+        [n] "+m" (n)
+      : [s2_8] "r" (s2_8),
+        [s3_2] "r" (s3_2)
+      : "ecx");
+}
+OIL_DEFINE_IMPL_FULL (mas8_u8_mmx_3, mas8_u8_l15, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
+
+/* mas8_u8_mmx_4: 8-tap variant of mas10_u8_mmx_4 -- one output per
+ * iteration via two pmaddwd dot products and a horizontal fold.  Same
+ * word-clamp caveat as mas10_u8_mmx_4 applies. */
+void
+mas8_u8_mmx_4 (uint8_t *d, const uint8_t *s1_np7, const int16_t *s2_8,
+    const int16_t *s3_2, int n)
+{
+  if (n == 0) return;
+  __asm__ __volatile__("\n"
+      "  pxor %%mm7, %%mm7\n"
+
+      "  movzwl 0(%[s3_2]), %%ecx\n"
+      "  movd %%ecx, %%mm6\n"
+
+      "  movzwl 2(%[s3_2]), %%ecx\n"
+      "  movd %%ecx, %%mm5\n"
+
+      "1:\n"
+      "  movd 0(%[s1_np7]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  pmaddwd 0(%[s2_8]), %%mm0\n"
+
+      "  movd 4(%[s1_np7]), %%mm1\n"
+      "  punpcklbw %%mm7, %%mm1\n"
+      "  pmaddwd 8(%[s2_8]), %%mm1\n"
+
+      /* horizontal add of the two dword partial sums, then offset */
+      "  paddd %%mm1, %%mm0\n"
+      "  movq %%mm0, %%mm1\n"
+      "  psrlq $32, %%mm0\n"
+      "  paddd %%mm1, %%mm0\n"
+      "  paddd %%mm6, %%mm0\n"
+
+      "  psrad %%mm5, %%mm0\n"
+      "  pmaxsw %%mm7, %%mm0\n"
+      "  packuswb %%mm0, %%mm0\n"
+      "  movd %%mm0, %%ecx\n"
+      "  movb %%cl,0(%[d])\n"
+
+      "  add $1, %[d]\n"
+      "  add $1, %[s1_np7]\n"
+      "  decl %[n]\n"
+      "  jnz 1b\n"
+      "  emms\n"
+      : [d] "+r" (d),
+        [s1_np7] "+r" (s1_np7),
+        [n] "+m" (n)
+      : [s2_8] "r" (s2_8),
+        [s3_2] "r" (s3_2)
+      : "ecx");
+}
+OIL_DEFINE_IMPL_FULL (mas8_u8_mmx_4, mas8_u8, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
+
+/* mas8_u8_sym_mmx_3: 8-tap symmetric FIR -- input pairs (0,7), (1,6),
+ * (2,5), (3,4) are summed first, halving the multiplies.
+ * NOTE(review): the first pair is accumulated with a hard-coded psubw
+ * (i.e. tap value -1) instead of multiplying by s2_8[0] -- see the
+ * commented-out pmullw lines.  This is only correct for tap sets whose
+ * outer coefficient is -1; confirm every caller of mas8_u8_sym_l15
+ * satisfies that, otherwise this implementation is wrong. */
+void
+mas8_u8_sym_mmx_3 (uint8_t *d, const uint8_t *s1_np7, const int16_t *s2_8,
+    const int16_t *s3_2, int n)
+{
+  int j;
+  int x;
+
+  while(n&3) {
+    x = 0;
+    for(j=0;j<8;j++){
+      x += s1_np7[j] * s2_8[j];
+    }
+    *d = CLAMP((x + s3_2[0])>>s3_2[1],0,255);
+    d++;
+    s1_np7++;
+    n--;
+  }
+
+  if (n == 0) return;
+  n>>=2;
+  __asm__ __volatile__("\n"
+      "  pxor %%mm7, %%mm7\n"
+
+      "  movd (%[s3_2]), %%mm6\n"
+
+      "  movzwl 2(%[s3_2]), %%ecx\n"
+      "  movd %%ecx, %%mm5\n"
+
+      "  movq 0(%[s2_8]), %%mm3\n"
+      "  movq 8(%[s2_8]), %%mm4\n"
+
+      "  .p2align 4,,15                  \n"
+      "1:\n"
+      /* load 128 */
+      "  pshufw $0x00, %%mm6, %%mm2\n"
+
+      "  movd 0(%[s1_np7]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  movd 7(%[s1_np7]), %%mm1\n"
+      "  punpcklbw %%mm7, %%mm1\n"
+      "  paddw %%mm1, %%mm0\n"
+      //"  pshufw $0x00, %%mm3, %%mm1\n"
+      //"  pmullw %%mm1, %%mm0\n"
+      //"  paddw %%mm0, %%mm2\n"
+      "  psubw %%mm0, %%mm2\n"
+
+      "  movd 1(%[s1_np7]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  movd 6(%[s1_np7]), %%mm1\n"
+      "  punpcklbw %%mm7, %%mm1\n"
+      "  paddw %%mm1, %%mm0\n"
+      "  pshufw $0x55*1, %%mm3, %%mm1\n"
+      "  pmullw %%mm1, %%mm0\n"
+      "  paddw %%mm0, %%mm2\n"
+
+      "  movd 2(%[s1_np7]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  movd 5(%[s1_np7]), %%mm1\n"
+      "  punpcklbw %%mm7, %%mm1\n"
+      "  paddw %%mm1, %%mm0\n"
+      "  pshufw $0x55*2, %%mm3, %%mm1\n"
+      "  pmullw %%mm1, %%mm0\n"
+      "  paddw %%mm0, %%mm2\n"
+
+      "  movd 3(%[s1_np7]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  movd 4(%[s1_np7]), %%mm1\n"
+      "  punpcklbw %%mm7, %%mm1\n"
+      "  paddw %%mm1, %%mm0\n"
+      "  pshufw $0x55*3, %%mm3, %%mm1\n"
+      "  pmullw %%mm1, %%mm0\n"
+      "  paddw %%mm0, %%mm2\n"
+
+      "  psraw %%mm5, %%mm2\n"
+      "  pmaxsw %%mm7, %%mm2\n"
+      "  packuswb %%mm2, %%mm2\n"
+      "  movd %%mm2, 0(%[d])\n"
+      "  add $4, %[d]\n"
+      "  add $4, %[s1_np7]\n"
+      "  decl %[n]\n"
+      "  jnz 1b\n"
+      "  emms\n"
+      : [d] "+r" (d),
+        [s1_np7] "+r" (s1_np7),
+        [n] "+m" (n)
+      : [s2_8] "r" (s2_8),
+        [s3_2] "r" (s3_2)
+      : "ecx");
+}
+OIL_DEFINE_IMPL_FULL (mas8_u8_sym_mmx_3, mas8_u8_sym_l15, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
+
+/* mas8_u8_sym_mmx_41: symmetric 8-tap variant that first expands the
+ * four folded taps into a stack table (tmp, 4 copies each) so the inner
+ * loop uses pmullw with memory operands.
+ * NOTE(review): mm5/mm6/mm7 are initialized in the first asm statement
+ * and consumed in the second; the compiler does not guarantee MMX
+ * registers are preserved between asm statements -- confirm intentional. */
+void
+mas8_u8_sym_mmx_41 (uint8_t *d, const uint8_t *s1_np7, const int16_t *s2_8,
+    const int16_t *s3_2, int n)
+{
+  int j;
+  int x;
+  int16_t tmp[16];   /* folded tap k replicated at tmp[4*k .. 4*k+3] */
+
+  while(n&3) {
+    x = 0;
+    for(j=0;j<8;j++){
+      x += s1_np7[j] * s2_8[j];
+    }
+    *d = CLAMP((x + s3_2[0])>>s3_2[1],0,255);
+    d++;
+    s1_np7++;
+    n--;
+  }
+
+  if (n == 0) return;
+  n>>=2;
+  __asm__ __volatile__("\n"
+      "  pxor %%mm7, %%mm7\n"
+
+      "  movd (%[s3_2]), %%mm6\n"
+
+      "  movzwl 2(%[s3_2]), %%ecx\n"
+      "  movd %%ecx, %%mm5\n"
+
+      /* broadcast taps 0-3 into the four quadwords of tmp[] */
+      "  movq 0(%[s2_8]), %%mm3\n"
+      "  pshufw $0x55*0, %%mm3, %%mm1\n"
+      "  movq %%mm1, 0(%[coeff])\n"
+      "  pshufw $0x55*1, %%mm3, %%mm1\n"
+      "  movq %%mm1, 8(%[coeff])\n"
+      "  pshufw $0x55*2, %%mm3, %%mm1\n"
+      "  movq %%mm1, 16(%[coeff])\n"
+      "  pshufw $0x55*3, %%mm3, %%mm1\n"
+      "  movq %%mm1, 24(%[coeff])\n"
+      :
+      : [s2_8] "r" (s2_8),
+        [s3_2] "r" (s3_2),
+        [coeff] "r" (tmp)
+      : "ecx");
+
+  __asm__ __volatile__("\n"
+      "  .p2align 4,,15                  \n"
+      "1:\n"
+      /* load 128 */
+      "  pshufw $0x00, %%mm6, %%mm2\n"
+
+      "  movd 0(%[s1_np7]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  movd 7(%[s1_np7]), %%mm1\n"
+      "  punpcklbw %%mm7, %%mm1\n"
+      "  paddw %%mm1, %%mm0\n"
+      "  pmullw 0(%[coeff]), %%mm0\n"
+      "  paddw %%mm0, %%mm2\n"
+
+      "  movd 1(%[s1_np7]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  movd 6(%[s1_np7]), %%mm1\n"
+      "  punpcklbw %%mm7, %%mm1\n"
+      "  paddw %%mm1, %%mm0\n"
+      "  pmullw 8(%[coeff]), %%mm0\n"
+      "  paddw %%mm0, %%mm2\n"
+
+      "  movd 2(%[s1_np7]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  movd 5(%[s1_np7]), %%mm1\n"
+      "  punpcklbw %%mm7, %%mm1\n"
+      "  paddw %%mm1, %%mm0\n"
+      "  pmullw 16(%[coeff]), %%mm0\n"
+      "  paddw %%mm0, %%mm2\n"
+
+      "  movd 3(%[s1_np7]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  movd 4(%[s1_np7]), %%mm1\n"
+      "  punpcklbw %%mm7, %%mm1\n"
+      "  paddw %%mm1, %%mm0\n"
+      "  pmullw 24(%[coeff]), %%mm0\n"
+      "  paddw %%mm0, %%mm2\n"
+
+      "  psraw %%mm5, %%mm2\n"
+      "  pmaxsw %%mm7, %%mm2\n"
+      "  packuswb %%mm2, %%mm2\n"
+      "  movd %%mm2, 0(%[d])\n"
+      "  add $4, %[d]\n"
+      "  add $4, %[s1_np7]\n"
+      "  decl %[n]\n"
+      "  jnz 1b\n"
+      "  emms\n"
+      : [d] "+r" (d),
+        [s1_np7] "+r" (s1_np7),
+        [n] "+m" (n)
+      : [s2_8] "r" (s2_8),
+        [coeff] "r" (tmp)
+      : "ecx");
+}
+OIL_DEFINE_IMPL_FULL (mas8_u8_sym_mmx_41, mas8_u8_sym_l15, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
+
+
+#define PSHUFW_3210 "0xe4"
+#define PSHUFW_0123 "0x1b"
+
+/* mas8_u8_sym_mmx_5: symmetric 8-tap, one output per iteration.  The
+ * second four input bytes are reversed (pshufw $0x1b) and added to the
+ * first four, exploiting tap symmetry so a single pmaddwd against taps
+ * 0-3 computes the whole dot product; the #else branch keeps the
+ * non-folded two-pmaddwd form for reference. */
+void
+mas8_u8_sym_mmx_5 (uint8_t *d, const uint8_t *s1_np7, const int16_t *s2_8,
+    const int16_t *s3_2, int n)
+{
+  if (n==0) return;
+  __asm__ __volatile__("\n"
+      "  pxor %%mm7, %%mm7\n"
+
+      /* mm6 = offset duplicated into both dword lanes */
+      "  movzwl 0(%[s3_2]), %%ecx\n"
+      "  movd %%ecx, %%mm6\n"
+      "  pshufw $0x44, %%mm6, %%mm6\n" // 01 00 01 00
+
+      "  movzwl 2(%[s3_2]), %%ecx\n"
+      "  movd %%ecx, %%mm5\n"
+
+      "  cmpl $0, %[n]\n"
+      "  jz 2f\n"
+
+      "1:\n"
+      "  movd 0(%[s1_np7]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+#if 1
+      "  movd 4(%[s1_np7]), %%mm1\n"
+      "  punpcklbw %%mm7, %%mm1\n"
+      "  pshufw $0x1b, %%mm1, %%mm1\n" // 00 01 10 11
+      "  paddw %%mm1, %%mm0\n"
+      "  pmaddwd 0(%[s2_8]), %%mm0\n"
+#else
+      "  pmaddwd 0(%[s2_8]), %%mm0\n"
+
+      "  movd 4(%[s1_np7]), %%mm1\n"
+      "  punpcklbw %%mm7, %%mm1\n"
+      "  pmaddwd 8(%[s2_8]), %%mm1\n"
+      "  paddd %%mm1, %%mm0\n"
+#endif
+
+      "  pshufw $0xee, %%mm0, %%mm1\n" // 11 10 11 10
+      "  paddd %%mm1, %%mm0\n"
+      "  paddd %%mm6, %%mm0\n"
+
+      "  psrad %%mm5, %%mm0\n"
+      "  pmaxsw %%mm7, %%mm0\n"
+      "  packuswb %%mm0, %%mm0\n"
+      "  movd %%mm0, %%ecx\n"
+      "  movb %%cl,0(%[d])\n"
+
+      "  add $1, %[d]\n"
+      "  add $1, %[s1_np7]\n"
+      "  decl %[n]\n"
+      "  jnz 1b\n"
+
+      "2:\n"
+      "  emms\n"
+      : [d] "+r" (d),
+        [s1_np7] "+r" (s1_np7),
+        [n] "+m" (n)
+      : [s2_8] "r" (s2_8),
+        [s3_2] "r" (s3_2)
+      : "ecx");
+}
+OIL_DEFINE_IMPL_FULL (mas8_u8_sym_mmx_5, mas8_u8_sym_l15, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
+
+/* mas8_u8_sym_mmx_6: SSSE3 variant using pmaddubsw (unsigned bytes *
+ * signed bytes), with taps narrowed to int8 via packsswb -- assumes every
+ * tap fits in [-128,127].
+ * NOTE(review): the coeff[]/ack locals are leftover debug code (coeff is
+ * filled with 0..7, the real assignment is commented out, and ack is
+ * never read) -- candidate for removal. */
+void
+mas8_u8_sym_mmx_6 (uint8_t *d, const uint8_t *s1_np7, const int16_t *s2_8,
+    const int16_t *s3_2, int n)
+{
+  int8_t coeff[8];
+  int8_t *ack;
+  int i;
+
+  for(i=0;i<8;i++){
+    //coeff[i] = s2_8[i];
+    coeff[i] = i;
+  }
+  ack = coeff;
+
+  if (n==0) return;
+  __asm__ __volatile__("\n"
+      "  pxor %%mm7, %%mm7\n"
+
+      "  movzwl 0(%[s3_2]), %%ecx\n"
+      "  movd %%ecx, %%mm6\n"
+      "  pshufw $0x44, %%mm6, %%mm6\n" // 01 00 01 00
+
+      "  movzwl 2(%[s3_2]), %%ecx\n"
+      "  movd %%ecx, %%mm5\n"
+
+      /* mm4 = all 8 taps saturated down to signed bytes */
+      "  movq 0(%[s2_8]), %%mm4\n"
+      "  packsswb 8(%[s2_8]), %%mm4\n"
+
+      "1:\n"
+      "  movq 0(%[s1_np7]), %%mm0\n"
+      "  pmaddubsw %%mm4, %%mm0\n"
+
+#if 1
+      /* horizontal add of 4 word partials (pshufw form) */
+      "  pshufw $0xee, %%mm0, %%mm1\n" // 11 10 11 10
+      "  paddw %%mm1, %%mm0\n"
+      "  pshufw $0x55, %%mm0, %%mm1\n" // 01 01 01 01
+      "  paddw %%mm1, %%mm0\n"
+#else
+      "  phaddw %%mm0, %%mm0\n"
+      "  phaddw %%mm0, %%mm0\n"
+#endif
+
+      "  paddw %%mm6, %%mm0\n"
+      "  psraw %%mm5, %%mm0\n"
+      "  pmaxsw %%mm7, %%mm0\n"
+      "  packuswb %%mm0, %%mm0\n"
+      "  movd %%mm0, %%ecx\n"
+      "  movb %%cl,0(%[d])\n"
+
+      "  add $1, %[d]\n"
+      "  add $1, %[s1_np7]\n"
+      "  decl %[n]\n"
+      "  jnz 1b\n"
+
+      "  emms\n"
+      : [d] "+r" (d),
+        [s1_np7] "+r" (s1_np7),
+        [n] "+m" (n)
+      : [s2_8] "r" (s2_8),
+        [s3_2] "r" (s3_2)
+      : "ecx");
+}
+OIL_DEFINE_IMPL_FULL (mas8_u8_sym_mmx_6, mas8_u8_sym_l15, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT|OIL_IMPL_FLAG_SSSE3);
+
+#ifdef ENABLE_BROKEN_IMPLS
+/* This only works for the taps array: -1, 3, -7, 21, 21, -7, 3, -1 */
+/* mas8_u8_supersym_mmx (ENABLE_BROKEN_IMPLS only): multiplier-free
+ * evaluation of the fixed tap set -1, 3, -7, 21, 21, -7, 3, -1 using
+ * shift/add identities on the folded input pairs.
+ * NOTE(review): mm3/mm4 are loaded with the taps at the top but then
+ * clobbered as scratch inside the loop, so the taps are never actually
+ * read -- consistent with this living behind ENABLE_BROKEN_IMPLS. */
+void
+mas8_u8_supersym_mmx (uint8_t *d, const uint8_t *s1_np7, const int16_t *s2_8,
+    const int16_t *s3_2, int n)
+{
+  int j;
+  int x;
+
+  while(n&3) {
+    x = 0;
+    for(j=0;j<8;j++){
+      x += s1_np7[j] * s2_8[j];
+    }
+    *d = CLAMP((x + s3_2[0])>>s3_2[1],0,255);
+    d++;
+    s1_np7++;
+    n--;
+  }
+
+  /* NOTE(review): no "if (n == 0) return;" here unlike the siblings --
+   * n==0 would run the asm loop with n>>2 == 0 and decl/jnz wrapping. */
+  n>>=2;
+  __asm__ __volatile__("\n"
+      "  pxor %%mm7, %%mm7\n"
+
+      "  movd (%[s3_2]), %%mm6\n"
+      "  pshufw $0x00, %%mm6, %%mm6\n"
+
+      "  movzwl 2(%[s3_2]), %%ecx\n"
+      "  movd %%ecx, %%mm5\n"
+
+      "  movq 0(%[s2_8]), %%mm3\n"
+      "  movq 8(%[s2_8]), %%mm4\n"
+
+      "1:\n"
+      "  movd 0(%[s1_np7]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  movd 7(%[s1_np7]), %%mm1\n"
+      "  punpcklbw %%mm7, %%mm1\n"
+      "  paddw %%mm1, %%mm0\n"
+
+      "  movd 1(%[s1_np7]), %%mm2\n"
+      "  punpcklbw %%mm7, %%mm2\n"
+      "  movd 6(%[s1_np7]), %%mm3\n"
+      "  punpcklbw %%mm7, %%mm3\n"
+      "  paddw %%mm3, %%mm2\n"
+
+      /* mm4 = 3*pair1 - pair0  (taps -1 and 3) */
+      "  paddw %%mm2, %%mm0\n"
+      "  psllw $2, %%mm2\n"
+      "  psubw %%mm0, %%mm2\n"
+      "  movq %%mm2, %%mm4\n"
+
+      "  movd 2(%[s1_np7]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  movd 5(%[s1_np7]), %%mm1\n"
+      "  punpcklbw %%mm7, %%mm1\n"
+      "  paddw %%mm1, %%mm0\n"
+
+      "  movd 3(%[s1_np7]), %%mm2\n"
+      "  punpcklbw %%mm7, %%mm2\n"
+      "  movd 4(%[s1_np7]), %%mm3\n"
+      "  punpcklbw %%mm7, %%mm3\n"
+      "  paddw %%mm3, %%mm2\n"
+
+      /* inner pairs combined with taps -7 and 21 via shift/add */
+      "  paddw %%mm2, %%mm0\n"
+      "  psllw $2, %%mm2\n"
+      "  psubw %%mm0, %%mm2\n"
+
+      "  psubw %%mm2, %%mm4\n"
+      "  psllw $3, %%mm2\n"
+      "  paddw %%mm4, %%mm2\n"
+
+      "  paddw %%mm6, %%mm2\n"
+
+      "  psraw %%mm5, %%mm2\n"
+      "  pmaxsw %%mm7, %%mm2\n"
+      "  packuswb %%mm2, %%mm2\n"
+      "  movd %%mm2, 0(%[d])\n"
+      "  add $4, %[d]\n"
+      "  add $4, %[s1_np7]\n"
+      "  decl %[n]\n"
+      "  jnz 1b\n"
+      "  emms\n"
+      : [d] "+r" (d),
+        [s1_np7] "+r" (s1_np7),
+        [n] "+m" (n)
+      : [s2_8] "r" (s2_8),
+        [s3_2] "r" (s3_2)
+      : "ecx");
+}
+OIL_DEFINE_IMPL_FULL (mas8_u8_supersym_mmx, mas8_u8_sym_l15, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
+#endif
+
+/* mas12_addc_rshift_decim2_u8_mmx_4: 12-tap FIR with decimation by 2 --
+ * the input pointer advances 2 bytes per output.  Three pmaddwd dot
+ * products (taps 0-3, 4-7, 8-11) are folded horizontally, offset added,
+ * shifted and clamped to [0,255].  Same low-word clamp caveat as
+ * mas10_u8_mmx_4. */
+void
+mas12_addc_rshift_decim2_u8_mmx_4 (uint8_t *d, const uint8_t *s1_2xnp11,
+    const int16_t *s2_12, const int16_t *s3_2, int n)
+{
+  if (n == 0) return;
+  __asm__ __volatile__("\n"
+      "  pxor %%mm7, %%mm7\n"
+
+      "  movzwl 0(%[s3_2]), %%ecx\n"
+      "  movd %%ecx, %%mm6\n"
+
+      "  movzwl 2(%[s3_2]), %%ecx\n"
+      "  movd %%ecx, %%mm5\n"
+
+      "1:\n"
+      "  movd 0(%[s1_2xnp11]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  pmaddwd 0(%[s2_12]), %%mm0\n"
+
+      "  movd 4(%[s1_2xnp11]), %%mm1\n"
+      "  punpcklbw %%mm7, %%mm1\n"
+      "  pmaddwd 8(%[s2_12]), %%mm1\n"
+      "  paddd %%mm1, %%mm0\n"
+
+      "  movd 8(%[s1_2xnp11]), %%mm1\n"
+      "  punpcklbw %%mm7, %%mm1\n"
+      "  pmaddwd 16(%[s2_12]), %%mm1\n"
+      "  paddd %%mm1, %%mm0\n"
+
+      "  movq %%mm0, %%mm1\n"
+      "  psrlq $32, %%mm0\n"
+      "  paddd %%mm1, %%mm0\n"
+      "  paddd %%mm6, %%mm0\n"
+
+      "  psrad %%mm5, %%mm0\n"
+      "  pmaxsw %%mm7, %%mm0\n"
+      "  packuswb %%mm0, %%mm0\n"
+      "  movd %%mm0, %%ecx\n"
+      "  movb %%cl,0(%[d])\n"
+
+      /* decimate: one output byte, two input bytes */
+      "  add $1, %[d]\n"
+      "  add $2, %[s1_2xnp11]\n"
+      "  decl %[n]\n"
+      "  jnz 1b\n"
+      "  emms\n"
+      : [d] "+r" (d),
+        [s1_2xnp11] "+r" (s1_2xnp11),
+        [n] "+m" (n)
+      : [s2_12] "r" (s2_12),
+        [s3_2] "r" (s3_2)
+      : "ecx");
+}
+OIL_DEFINE_IMPL_FULL (mas12_addc_rshift_decim2_u8_mmx_4,
+    mas12_addc_rshift_decim2_u8, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
+
+#if 0
+/* mas8_addc_rshift_decim2_u8_mmx_4 (currently disabled by the enclosing
+ * #if 0): 8-tap decimate-by-2 analog of the 12-tap routine above -- two
+ * pmaddwd dot products, horizontal fold, offset, shift, clamp. */
+void
+mas8_addc_rshift_decim2_u8_mmx_4 (uint8_t *d, const uint8_t *s1_2xnp9,
+    const int16_t *s2_8, const int16_t *s3_2, int n)
+{
+  if (n == 0) return;
+  __asm__ __volatile__("\n"
+      "  pxor %%mm7, %%mm7\n"
+
+      "  movzwl 0(%[s3_2]), %%ecx\n"
+      "  movd %%ecx, %%mm6\n"
+
+      "  movzwl 2(%[s3_2]), %%ecx\n"
+      "  movd %%ecx, %%mm5\n"
+
+      "1:\n"
+      "  movd 0(%[s1_2xnp9]), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  pmaddwd 0(%[s2_8]), %%mm0\n"
+
+      "  movd 4(%[s1_2xnp9]), %%mm1\n"
+      "  punpcklbw %%mm7, %%mm1\n"
+      "  pmaddwd 8(%[s2_8]), %%mm1\n"
+      "  paddd %%mm1, %%mm0\n"
+
+      "  movq %%mm0, %%mm1\n"
+      "  psrlq $32, %%mm0\n"
+      "  paddd %%mm1, %%mm0\n"
+      "  paddd %%mm6, %%mm0\n"
+
+      "  psrad %%mm5, %%mm0\n"
+      "  pmaxsw %%mm7, %%mm0\n"
+      "  packuswb %%mm0, %%mm0\n"
+      "  movd %%mm0, %%ecx\n"
+      "  movb %%cl,0(%[d])\n"
+
+      "  add $1, %[d]\n"
+      "  add $2, %[s1_2xnp9]\n"
+      "  decl %[n]\n"
+      "  jnz 1b\n"
+      "  emms\n"
+      : [d] "+r" (d),
+        [s1_2xnp9] "+r" (s1_2xnp9),
+        [n] "+m" (n)
+      : [s2_8] "r" (s2_8),
+        [s3_2] "r" (s3_2)
+      : "ecx");
+}
+OIL_DEFINE_IMPL_FULL (mas8_addc_rshift_decim2_u8_mmx_4,
+    mas8_addc_rshift_decim2_u8, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
+
+#endif
+
+/* mas8_across_u8_mmx_3: vertical 8-tap FIR -- each output column reads 8
+ * input rows spaced ss1 bytes apart.  Constants (mm3/mm4 = taps, mm5 =
+ * shift, mm6 = offset, mm7 = 0) are set once, then each 4-output step
+ * walks a row pointer p down by ss1 per tap.
+ * NOTE(review): the per-step asm writes through [d] but declares it only
+ * as an "r" input with no "memory" clobber, and the MMX constants are
+ * relied upon across separate asm statements -- neither is guaranteed by
+ * the compiler; flag for hardening. */
+void
+mas8_across_u8_mmx_3 (uint8_t *d, const uint8_t *s1_nx8, int ss1,
+    const int16_t *s2_8, const int16_t *s3_2, int n)
+{
+  int i;
+  int x;
+
+  while(n&3) {
+    x = 0;
+    for(i=0;i<8;i++){
+      x += OIL_GET(s1_nx8, i*ss1, uint8_t)*s2_8[i];
+    }
+    *d = CLAMP((x + s3_2[0])>>s3_2[1],0,255);
+    d++;
+    s1_nx8++;
+    n--;
+  }
+
+  if (n == 0) return;
+  n>>=2;
+  __asm__ __volatile__("\n"
+      "  pxor %%mm7, %%mm7\n"
+
+      "  movd (%[s3_2]), %%mm6\n"
+
+      "  movzwl 2(%[s3_2]), %%ecx\n"
+      "  movd %%ecx, %%mm5\n"
+
+      "  movq 0(%[s2_8]), %%mm3\n"
+      "  movq 8(%[s2_8]), %%mm4\n"
+      :
+      : [s2_8] "r" (s2_8),
+        [s3_2] "r" (s3_2)
+      : "ecx");
+
+  while (n > 0) {
+    const uint8_t *p = s1_nx8;
+    __asm__ __volatile__("\n"
+        "1:\n"
+        /* load 128 */
+        "  pshufw $0x00, %%mm6, %%mm2\n"
+
+        "  movd 0(%[p]), %%mm0\n"
+        "  add %[ss1], %[p]\n"
+        "  punpcklbw %%mm7, %%mm0\n"
+        "  pshufw $0x00, %%mm3, %%mm1\n"
+        "  pmullw %%mm1, %%mm0\n"
+        "  paddw %%mm0, %%mm2\n"
+
+        "  movd 0(%[p]), %%mm0\n"
+        "  add %[ss1], %[p]\n"
+        "  punpcklbw %%mm7, %%mm0\n"
+        "  pshufw $0x55*1, %%mm3, %%mm1\n"
+        "  pmullw %%mm1, %%mm0\n"
+        "  paddw %%mm0, %%mm2\n"
+
+        "  movd 0(%[p]), %%mm0\n"
+        "  add %[ss1], %[p]\n"
+        "  punpcklbw %%mm7, %%mm0\n"
+        "  pshufw $0x55*2, %%mm3, %%mm1\n"
+        "  pmullw %%mm1, %%mm0\n"
+        "  paddw %%mm0, %%mm2\n"
+
+        "  movd 0(%[p]), %%mm0\n"
+        "  add %[ss1], %[p]\n"
+        "  punpcklbw %%mm7, %%mm0\n"
+        "  pshufw $0x55*3, %%mm3, %%mm1\n"
+        "  pmullw %%mm1, %%mm0\n"
+        "  paddw %%mm0, %%mm2\n"
+
+        "  movd 0(%[p]), %%mm0\n"
+        "  add %[ss1], %[p]\n"
+        "  punpcklbw %%mm7, %%mm0\n"
+        "  pshufw $0x00, %%mm4, %%mm1\n"
+        "  pmullw %%mm1, %%mm0\n"
+        "  paddw %%mm0, %%mm2\n"
+
+        "  movd 0(%[p]), %%mm0\n"
+        "  add %[ss1], %[p]\n"
+        "  punpcklbw %%mm7, %%mm0\n"
+        "  pshufw $0x55*1, %%mm4, %%mm1\n"
+        "  pmullw %%mm1, %%mm0\n"
+        "  paddw %%mm0, %%mm2\n"
+
+        "  movd 0(%[p]), %%mm0\n"
+        "  add %[ss1], %[p]\n"
+        "  punpcklbw %%mm7, %%mm0\n"
+        "  pshufw $0x55*2, %%mm4, %%mm1\n"
+        "  pmullw %%mm1, %%mm0\n"
+        "  paddw %%mm0, %%mm2\n"
+
+        "  movd 0(%[p]), %%mm0\n"
+        "  add %[ss1], %[p]\n"
+        "  punpcklbw %%mm7, %%mm0\n"
+        "  pshufw $0x55*3, %%mm4, %%mm1\n"
+        "  pmullw %%mm1, %%mm0\n"
+        "  paddw %%mm0, %%mm2\n"
+
+        "  psraw %%mm5, %%mm2\n"
+        "  pmaxsw %%mm7, %%mm2\n"
+        "  packuswb %%mm2, %%mm2\n"
+        "  movd %%mm2, 0(%[d])\n"
+        : [p] "+r" (p)
+        : [d] "r" (d), [ss1] "r" ((long)ss1));
+    d+=4;
+    s1_nx8+=4;
+    n--;
+  }
+
+  asm volatile ("emms");
+}
+OIL_DEFINE_IMPL_FULL (mas8_across_u8_mmx_3, mas8_across_u8, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
+
diff --git a/liboil/i386_amd64/multiply_and_acc.c b/liboil/i386_amd64/multiply_and_acc.c
new file mode 100644
index 0000000..c7d6f7f
--- /dev/null
+++ b/liboil/i386_amd64/multiply_and_acc.c
@@ -0,0 +1,250 @@
+
+#include <liboil/liboilfunction.h>
+#include <liboil/liboilclasses.h>
+
+/* multiply_and_acc_6xn_s16_u8_mmx: for each of n rows,
+ *   i1[row][k] += s1[row][k] * s2[row][k]   (k = 0..5)
+ * with per-array row strides is1/ss1/ss2.  The second store is a movd
+ * (4 bytes = 2 int16), so exactly 6 words are written per row; the
+ * existing FIXME notes that the loads still read past the 6 valid
+ * elements (movd pulls s2[4..7], pmullw reads s1[4..7]). */
+void
+multiply_and_acc_6xn_s16_u8_mmx (int16_t *i1, int is1, int16_t *s1,
+    int ss1, uint8_t *s2, int ss2, int n)
+{
+  /* FIXME this reads outside the arrays.  Bad. */
+  if (n==0) return;
+  __asm__ __volatile__ ("\n"
+      "  pxor %%mm7, %%mm7\n"
+      "1:\n"
+      "  movd 0(%2), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  pmullw 0(%1), %%mm0\n"
+      "  paddw 0(%0), %%mm0\n"
+      "  movq %%mm0, 0(%0)\n"
+      "  movd 4(%2), %%mm1\n"
+      "  punpcklbw %%mm7, %%mm1\n"
+      "  pmullw 8(%1), %%mm1\n"
+      "  paddw 8(%0), %%mm1\n"
+      "  movd %%mm1, 8(%0)\n"
+
+      /* advance each pointer by its row stride */
+      "  add %4, %0\n"
+      "  add %5, %1\n"
+      "  add %6, %2\n"
+      "  decl %3\n"
+      "  jnz 1b\n"
+      "  emms\n"
+      : "+r" (i1), "+r" (s1), "+r" (s2), "+m" (n)
+      : "r" ((long)is1), "r" ((long)ss1), "r" ((long)ss2)
+      );
+}
+OIL_DEFINE_IMPL_FULL (multiply_and_acc_6xn_s16_u8_mmx,
+    multiply_and_acc_6xn_s16_u8, OIL_IMPL_FLAG_MMX);
+
+/* multiply_and_acc_8xn_s16_u8_mmx: i1[row][k] += s1[row][k]*s2[row][k]
+ * for k = 0..7 over n strided rows (two movq stores of 4 words each).
+ * NOTE(review): writes through %0 with no "memory" clobber -- relies on
+ * the compiler not caching *i1 across the asm; flag for hardening. */
+void
+multiply_and_acc_8xn_s16_u8_mmx (int16_t *i1, int is1, int16_t *s1,
+    int ss1, uint8_t *s2, int ss2, int n)
+{
+  if (n==0) return;
+  __asm__ __volatile__ ("\n"
+      "  pxor %%mm7, %%mm7\n"
+      "1:\n"
+      "  movd 0(%2), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  pmullw 0(%1), %%mm0\n"
+      "  paddw 0(%0), %%mm0\n"
+      "  movq %%mm0, 0(%0)\n"
+      "  movd 4(%2), %%mm1\n"
+      "  punpcklbw %%mm7, %%mm1\n"
+      "  pmullw 8(%1), %%mm1\n"
+      "  paddw 8(%0), %%mm1\n"
+      "  movq %%mm1, 8(%0)\n"
+
+      "  add %4, %0\n"
+      "  add %5, %1\n"
+      "  add %6, %2\n"
+      "  decl %3\n"
+      "  jnz 1b\n"
+      "  emms\n"
+      : "+r" (i1), "+r" (s1), "+r" (s2), "+m" (n)
+      : "r" ((long)is1), "r" ((long)ss1), "r" ((long)ss2)
+      );
+}
+OIL_DEFINE_IMPL_FULL (multiply_and_acc_8xn_s16_u8_mmx,
+    multiply_and_acc_8xn_s16_u8, OIL_IMPL_FLAG_MMX);
+
+/* multiply_and_acc_16xn_s16_u8_mmx: i1[row][k] += s1[row][k]*s2[row][k]
+ * for k = 0..15 over n strided rows; four 4-word groups per row (mm2 is
+ * reused as scratch for the last two groups). */
+void
+multiply_and_acc_16xn_s16_u8_mmx (int16_t *i1, int is1, int16_t *s1,
+    int ss1, uint8_t *s2, int ss2, int n)
+{
+  if (n==0) return;
+  __asm__ __volatile__ ("\n"
+      "  pxor %%mm7, %%mm7\n"
+      "1:\n"
+      "  movd 0(%2), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  pmullw 0(%1), %%mm0\n"
+      "  paddw 0(%0), %%mm0\n"
+      "  movq %%mm0, 0(%0)\n"
+      "  movd 4(%2), %%mm1\n"
+      "  punpcklbw %%mm7, %%mm1\n"
+      "  pmullw 8(%1), %%mm1\n"
+      "  paddw 8(%0), %%mm1\n"
+      "  movq %%mm1, 8(%0)\n"
+      "  movd 8(%2), %%mm2\n"
+      "  punpcklbw %%mm7, %%mm2\n"
+      "  pmullw 16(%1), %%mm2\n"
+      "  paddw 16(%0), %%mm2\n"
+      "  movq %%mm2, 16(%0)\n"
+      "  movd 12(%2), %%mm2\n"
+      "  punpcklbw %%mm7, %%mm2\n"
+      "  pmullw 24(%1), %%mm2\n"
+      "  paddw 24(%0), %%mm2\n"
+      "  movq %%mm2, 24(%0)\n"
+
+      "  add %4, %0\n"
+      "  add %5, %1\n"
+      "  add %6, %2\n"
+      "  decl %3\n"
+      "  jnz 1b\n"
+      "  emms\n"
+      : "+r" (i1), "+r" (s1), "+r" (s2), "+m" (n)
+      : "r" ((long)is1), "r" ((long)ss1), "r" ((long)ss2)
+      );
+}
+OIL_DEFINE_IMPL_FULL (multiply_and_acc_16xn_s16_u8_mmx,
+    multiply_and_acc_16xn_s16_u8, OIL_IMPL_FLAG_MMX);
+
+/* multiply_and_acc_24xn_s16_u8_mmx: i1[row][k] += s1[row][k]*s2[row][k]
+ * for k = 0..23 over n strided rows; six 4-word groups per row. */
+void
+multiply_and_acc_24xn_s16_u8_mmx (int16_t *i1, int is1, int16_t *s1,
+    int ss1, uint8_t *s2, int ss2, int n)
+{
+  if (n==0) return;
+  __asm__ __volatile__ ("\n"
+      "  pxor %%mm7, %%mm7\n"
+      "1:\n"
+      "  movd 0(%2), %%mm0\n"
+      "  punpcklbw %%mm7, %%mm0\n"
+      "  pmullw 0(%1), %%mm0\n"
+      "  paddw 0(%0), %%mm0\n"
+      "  movq %%mm0, 0(%0)\n"
+      "  movd 4(%2), %%mm1\n"
+      "  punpcklbw %%mm7, %%mm1\n"
+      "  pmullw 8(%1), %%mm1\n"
+      "  paddw 8(%0), %%mm1\n"
+      "  movq %%mm1, 8(%0)\n"
+      "  movd 8(%2), %%mm2\n"
+      "  punpcklbw %%mm7, %%mm2\n"
+      "  pmullw 16(%1), %%mm2\n"
+      "  paddw 16(%0), %%mm2\n"
+      "  movq %%mm2, 16(%0)\n"
+      "  movd 12(%2), %%mm2\n"
+      "  punpcklbw %%mm7, %%mm2\n"
+      "  pmullw 24(%1), %%mm2\n"
+      "  paddw 24(%0), %%mm2\n"
+      "  movq %%mm2, 24(%0)\n"
+      "  movd 16(%2), %%mm2\n"
+      "  punpcklbw %%mm7, %%mm2\n"
+      "  pmullw 32(%1), %%mm2\n"
+      "  paddw 32(%0), %%mm2\n"
+      "  movq %%mm2, 32(%0)\n"
+      "  movd 20(%2), %%mm2\n"
+      "  punpcklbw %%mm7, %%mm2\n"
+      "  pmullw 40(%1), %%mm2\n"
+      "  paddw 40(%0), %%mm2\n"
+      "  movq %%mm2, 40(%0)\n"
+
+      "  add %4, %0\n"
+      "  add %5, %1\n"
+      "  add %6, %2\n"
+      "  decl %3\n"
+      "  jnz 1b\n"
+      "  emms\n"
+      : "+r" (i1), "+r" (s1), "+r" (s2), "+m" (n)
+      : "r" ((long)is1), "r" ((long)ss1), "r" ((long)ss2)
+      );
+}
+OIL_DEFINE_IMPL_FULL (multiply_and_acc_24xn_s16_u8_mmx,
+    multiply_and_acc_24xn_s16_u8, OIL_IMPL_FLAG_MMX);
+
+
+#if 0
+/* mas2_add_s16_sse (currently disabled by the enclosing #if 0):
+ *   d1[i] = s1[i] + ((s4_2[0] + s2[i]*s3_2[0] + s2[i+1]*s3_2[1]) >> s4_2[1])
+ * computed 8 int16 lanes at a time with 32-bit intermediates
+ * (pmullw/pmulhw + punpck to widen).  The "mm" register names in the
+ * comments are historical -- the code uses xmm registers.
+ * NOTE(review): "paddw 0(%1)" requires 16-byte alignment of s1, which is
+ * not guaranteed here; likely one reason this stays disabled.  xmm5-xmm7
+ * also carry state between the two asm statements. */
+void
+mas2_add_s16_sse (int16_t *d1, int16_t *s1, int16_t *s2, int16_t *s3_2,
+    int16_t *s4_2, int n)
+{
+  int shift = s4_2[1];
+
+  while (n&7) {
+    int x;
+
+    x = s4_2[0] + s2[0]*s3_2[0] + s2[1]*s3_2[1];
+    x >>= s4_2[1];
+    d1[0] = s1[0] + x;
+
+    d1++;
+    s1++;
+    s2++;
+    n--;
+  }
+  if (n==0) return;
+
+  n>>=3;
+  /* xmm7 = s3_2[0] broadcast, xmm6 = s3_2[1] broadcast,
+   * xmm5 = s4_2[0] offset duplicated per dword */
+  asm volatile ("\n"
+      "  movzwl 0(%0), %%ecx\n"
+      "  movd %%ecx, %%xmm7\n"
+      "  pshuflw $0x00, %%xmm7, %%xmm7\n"
+      "  pshufd $0x00, %%xmm7, %%xmm7\n"
+      "  movzwl 2(%0), %%ecx\n"
+      "  movd %%ecx, %%xmm6\n"
+      "  pshuflw $0x00, %%xmm6, %%xmm6\n"
+      "  pshufd $0x00, %%xmm6, %%xmm6\n"
+      "  movzwl 0(%1), %%ecx\n"
+      "  movd %%ecx, %%xmm5\n"
+      "  pshuflw $0x44, %%xmm5, %%xmm5\n"
+      "  pshufd $0x00, %%xmm5, %%xmm5\n"
+      :: "r" (s3_2), "r" (s4_2)
+      : "ecx"
+      );
+  asm volatile ("\n"
+      "1:\n"
+      "  movdqu 0(%2), %%xmm0\n"       // mm0 = s0, s1, s2, s3
+      "  movdqu 0(%2), %%xmm1\n"       // mm1 = s0, s1, s2, s3
+      "  pmullw %%xmm7, %%xmm0\n"      // mm0 = lo(s0*a0), lo(s1*a0), ...
+      "  pmulhw %%xmm7, %%xmm1\n"      // mm1 = hi(s0*a0), hi(s1*a0), ...
+      "  movdqu %%xmm0, %%xmm2\n"      // mm2 = lo(s0*a0), lo(s1*a0), ...
+      "  punpcklwd %%xmm1, %%xmm0\n"   // mm0 = s0*a0, s1*a0
+      "  punpckhwd %%xmm1, %%xmm2\n"   // mm2 = s2*a0, s3*a0
+      "  movdqu %%xmm2, %%xmm1\n"      // mm1 = s2*a0, s3*a0
+
+      "  movdqu 2(%2), %%xmm2\n"
+      "  movdqu 2(%2), %%xmm3\n"
+      "  pmullw %%xmm6, %%xmm2\n"
+      "  pmulhw %%xmm6, %%xmm3\n"
+      "  movdqu %%xmm2, %%xmm4\n"
+      "  punpcklwd %%xmm3, %%xmm2\n"   // mm2 = s1*a1, s2*a1
+      "  punpckhwd %%xmm3, %%xmm4\n"   // mm4 = s3*a1, s4*a1
+      "  movdqu %%xmm4, %%xmm3\n"      // mm3 = s3*a1, s4*a1
+
+      "  paddd %%xmm3, %%xmm1\n"       // mm1 = s2*a0 + s3*a1, ...
+      "  paddd %%xmm2, %%xmm0\n"       // mm0 = s0*a0 + s1*a1, ...
+
+      "  paddd %%xmm5, %%xmm1\n"       // mm1 = s2*a0 + s3*a1 + offset, ...
+      "  paddd %%xmm5, %%xmm0\n"       // mm0 = s0*a0 + s1*a1 + offset, ...
+
+      "  movd %4, %%xmm4\n"
+      "  psrad %%xmm4, %%xmm1\n"       // mm1 = (s2*a0 + s3*a1 + offset)>>shift, ...
+      "  psrad %%xmm4, %%xmm0\n"       // mm0 = (s0*a0 + s1*a1 + offset)>>shift, ...
+
+      "  packssdw %%xmm1, %%xmm0\n"
+      "  paddw 0(%1), %%xmm0\n"
+      "  movdqu %%xmm0, 0(%0)\n"
+      "  add $16, %0\n"
+      "  add $16, %1\n"
+      "  add $16, %2\n"
+      "  decl %3\n"
+      "  jnz 1b\n"
+      : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n)
+      : "r" (shift)
+      );
+}
+OIL_DEFINE_IMPL_FULL (mas2_add_s16_sse, mas2_add_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
+#endif
+
+
+
+
More information about the Liboil-commit
mailing list