[PATCH] Optimize fetch_scanline_x8r8g8b8 with SSSE3 instructions
Liu Xinyun
xinyun.liu@intel.com
Wed Aug 11 01:31:47 PDT 2010
Signed-off-by: Liu Xinyun <xinyun.liu@intel.com>
Signed-off-by: Xu, Samuel <samuel.xu@intel.com>
Signed-off-by: Ma, Ling <ling.ma@intel.com>
---
configure.ac | 56 +++
pixman/Makefile.am | 14 +
pixman/pixman-access-ssse3.S | 1119 ++++++++++++++++++++++++++++++++++++++++++
pixman/pixman-access.c | 6 +
4 files changed, 1195 insertions(+), 0 deletions(-)
create mode 100644 pixman/pixman-access-ssse3.S
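
The new routine computes the same result as the scalar loop it substitutes
for in pixman-access.c (kept there as the #else fallback): copy one
scanline of pixels and force the alpha byte on. A minimal C sketch of
those semantics (the name fetch_scanline_x8r8g8b8_ref is illustrative,
not a symbol from this patch):

    #include <stdint.h>

    /* Copy `len' bytes (len = width * 4, a whole number of x8r8g8b8
     * pixels) from src to dest, OR-ing 0xff000000 into each 32-bit
     * word so the unused x8 channel reads back as opaque alpha. */
    static void
    fetch_scanline_x8r8g8b8_ref (uint32_t *dest, const uint32_t *src, int len)
    {
        int i;

        for (i = 0; i < len / 4; i++)
            dest[i] = src[i] | 0xff000000;
    }
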
diff --git a/configure.ac b/configure.ac
index 98c2783..ed03d24 100644
--- a/configure.ac
+++ b/configure.ac
@@ -370,6 +370,57 @@ fi
AM_CONDITIONAL(USE_SSE2, test $have_sse2_intrinsics = yes)
dnl ===========================================================================
+dnl Check for SSSE3
+
+if test "x$SSSE3_CFLAGS" = "x" ; then
+ if test "x$SUNCC" = "xyes"; then
+ # SSSE3 is enabled by default in the Sun Studio 64-bit environment
+ if test "$AMD64_ABI" = "no" ; then
+ SSSE3_CFLAGS="-xarch=ssse3"
+ fi
+ else
+ SSSE3_CFLAGS="-mmmx -mssse3 -Winline"
+ fi
+fi
+
+have_ssse3_intrinsics=no
+AC_MSG_CHECKING(whether to use SSSE3 intrinsics)
+xserver_save_CFLAGS=$CFLAGS
+CFLAGS="$SSSE3_CFLAGS $CFLAGS"
+
+AC_COMPILE_IFELSE([
+#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 2))
+# if !defined(__amd64__) && !defined(__x86_64__)
+# error "Need GCC >= 4.2 for SSSE3 intrinsics on x86"
+# endif
+#endif
+int main () {
+ __asm__ __volatile__("PSHUFB %xmm1,%xmm0;");
+ return 0;
+}], have_ssse3_intrinsics=yes)
+CFLAGS=$xserver_save_CFLAGS
+
+AC_ARG_ENABLE(ssse3,
+ [AC_HELP_STRING([--disable-ssse3],
+ [disable SSSE3 fast paths])],
+ [enable_ssse3=$enableval], [enable_ssse3=auto])
+
+if test $enable_ssse3 = no ; then
+ have_ssse3_intrinsics=disabled
+fi
+
+if test $have_ssse3_intrinsics = yes ; then
+ AC_DEFINE(USE_SSSE3, 1, [use SSSE3 compiler intrinsics])
+fi
+
+AC_MSG_RESULT($have_ssse3_intrinsics)
+if test $enable_ssse3 = yes && test $have_ssse3_intrinsics = no ; then
+ AC_MSG_ERROR([SSSE3 intrinsics not detected])
+fi
+
+AM_CONDITIONAL(USE_SSSE3, test $have_ssse3_intrinsics = yes)
+
+dnl ===========================================================================
dnl Other special flags needed when building code using MMX or SSE instructions
case $host_os in
solaris*)
@@ -395,6 +446,9 @@ case $host_os in
if test "x$SSE2_LDFLAGS" = "x" ; then
SSE2_LDFLAGS="$HWCAP_LDFLAGS"
fi
+ if test "x$SSSE3_LDFLAGS" = "x" ; then
+ SSSE3_LDFLAGS="$HWCAP_LDFLAGS"
+ fi
;;
esac
@@ -402,6 +456,8 @@ AC_SUBST(MMX_CFLAGS)
AC_SUBST(MMX_LDFLAGS)
AC_SUBST(SSE2_CFLAGS)
AC_SUBST(SSE2_LDFLAGS)
+AC_SUBST(SSSE3_CFLAGS)
+AC_SUBST(SSSE3_LDFLAGS)
dnl ===========================================================================
dnl Check for VMX/Altivec
diff --git a/pixman/Makefile.am b/pixman/Makefile.am
index a9de19f..ae7f1b0 100644
--- a/pixman/Makefile.am
+++ b/pixman/Makefile.am
@@ -93,6 +93,20 @@ libpixman_1_la_LIBADD += libpixman-sse2.la
ASM_CFLAGS_sse2=$(SSE2_CFLAGS)
endif
+# ssse3 code
+if USE_SSSE3
+noinst_LTLIBRARIES += libpixman-ssse3.la
+libpixman_ssse3_la_SOURCES = \
+ pixman-access-ssse3.S
+libpixman_ssse3_la_CFLAGS = $(DEP_CFLAGS) $(SSSE3_CFLAGS)
+libpixman_ssse3_la_LIBADD = $(DEP_LIBS)
+libpixman_1_la_LDFLAGS += $(SSSE3_LDFLAGS)
+libpixman_1_la_LIBADD += libpixman-ssse3.la
+
+ASM_CFLAGS_ssse3=$(SSSE3_CFLAGS)
+endif
+
+
# arm simd code
if USE_ARM_SIMD
noinst_LTLIBRARIES += libpixman-arm-simd.la
diff --git a/pixman/pixman-access-ssse3.S b/pixman/pixman-access-ssse3.S
new file mode 100644
index 0000000..f0ebd36
--- /dev/null
+++ b/pixman/pixman-access-ssse3.S
@@ -0,0 +1,1119 @@
+#ifndef MEMCPY_OR
+# define MEMCPY_OR fetch_scanline_x8r8g8b8_ssse3_fast_path
+#endif
+
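+/* MEMCPY_OR behaves like memcpy (DEST, SRC, LEN), except that it ORs
+   0xff000000 into every 32-bit word copied, so the unused x8 channel
+   of x8r8g8b8 pixels reads back as opaque alpha. */
+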
+#ifndef L
+# define L(label) .L##label
+#endif
+
+#ifndef ALIGN
+# define ALIGN(n) .p2align n
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc .cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc .cfi_endproc
+#endif
+
+#ifndef cfi_rel_offset
+# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
+#endif
+
+#ifndef cfi_restore
+# define cfi_restore(reg) .cfi_restore reg
+#endif
+
+#ifndef cfi_adjust_cfa_offset
+# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
+#endif
+
+#ifndef cfi_remember_state
+# define cfi_remember_state .cfi_remember_state
+#endif
+
+#ifndef cfi_restore_state
+# define cfi_restore_state .cfi_restore_state
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name) \
+ .type name, @function; \
+ .globl name; \
+ .p2align 4; \
+name: \
+ cfi_startproc
+#endif
+
+#ifndef END
+# define END(name) \
+ cfi_endproc; \
+ .size name, .-name
+#endif
+
+#ifdef USE_AS_BCOPY
+# define SRC PARMS
+# define DEST SRC+4
+# define LEN DEST+4
+#else
+# define DEST PARMS
+# define SRC DEST+4
+# define LEN SRC+4
+#endif
+
+#define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+#define PUSH(REG) pushl REG; CFI_PUSH (REG)
+#define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS 8 /* Preserve EBX. */
+# define ENTRANCE PUSH (%ebx);
+# define RETURN_END POP (%ebx); ret
+# define RETURN RETURN_END; CFI_PUSH (%ebx)
+# define JMPTBL(I, B) I - B
+
+/* Load an entry in a jump table into EBX and branch to it. TABLE is a
+ jump table with relative offsets. INDEX is a register containing the
+ index into the jump table. SCALE is the scale of INDEX. */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+ /* We first load PC into EBX. */ \
+ call __i686.get_pc_thunk.bx; \
+ /* Get the address of the jump table. */ \
+ addl $(TABLE - .), %ebx; \
+ shr $2, INDEX; \
+ /* Get the entry and convert the relative offset to the \
+ absolute address. */ \
+ addl (%ebx,INDEX,SCALE), %ebx; \
+ /* We loaded the jump table. Go. */ \
+ jmp *%ebx
+
+# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE) \
+ addl $(TABLE - .), %ebx
+
+# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \
+ shr $2, INDEX; \
+ addl (%ebx,INDEX,SCALE), %ebx; \
+ /* We loaded the jump table. Go. */ \
+ jmp *%ebx
+
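+/* PIC helper: returns with the caller's return address (the current PC)
+   in EBX, so the jump tables below can be addressed PC-relative. */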
+ .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+ .globl __i686.get_pc_thunk.bx
+ .hidden __i686.get_pc_thunk.bx
+ ALIGN (4)
+ .type __i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+ movl (%esp), %ebx
+ ret
+
+ .section .text.ssse3,"ax",@progbits
+ENTRY (MEMCPY_OR)
+ ENTRANCE
+ movl LEN(%esp), %ecx
+ movl SRC(%esp), %eax
+ movl DEST(%esp), %edx
+
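+/* Copies shorter than 48 bytes are finished in one shot through a jump
+   table indexed by length / 4, using the forward or backward variant
+   depending on the relative position of source and destination. */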
+ cmp $48, %ecx
+ jae L(48bytesormore)
+
+ cmp %dl, %al
+ jb L(bk_write)
+ add %ecx, %edx
+ add %ecx, %eax
+ BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+L(bk_write):
+ BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
+
+ ALIGN (4)
+/* ECX >= 48 and EDX is 4-byte aligned. */
+L(48bytesormore):
+ movdqu (%eax), %xmm0
+ PUSH (%edi)
+ mov $0xff000000, %edi
+ movd %edi, %xmm6
+ movl %edx, %edi
+ and $-16, %edx
+ PUSH (%esi)
+ add $16, %edx
+ movl %edi, %esi
+ sub %edx, %edi
+ add %edi, %ecx
+ sub %edi, %eax
+
+ mov %esi, %edi
+ pshufd $0, %xmm6, %xmm6
+ and $3, %edi
+ por %xmm6, %xmm0
+ jz L(aligned4bytes)
+ cmp $3, %edi
+ psrldq $1, %xmm6
+ jz L(aligned4bytes)
+ cmp $2, %edi
+ psrldq $1, %xmm6
+ jz L(aligned4bytes)
+ psrldq $1, %xmm6
+L(aligned4bytes):
+ mov %eax, %edi
+ and $0xf, %edi
+ jz L(shl_0)
+ sub %edi, %eax
+ call __i686.get_pc_thunk.bx
+ addl $(L(shl_table)- .), %ebx
+ movaps (%eax), %xmm1
+ addl (%ebx,%edi,4), %ebx
+ movdqu %xmm0, (%esi)
+ jmp *%ebx
+
+ ALIGN (4)
+L(shl_0):
+ movdqu %xmm0, (%esi)
+ xor %edi, %edi
+ POP (%esi)
+ cmp $127, %ecx
+ ja L(shl_0_gobble)
+ lea -32(%ecx), %ecx
+
+ movdqa (%eax, %edi), %xmm0
+ sub $32, %ecx
+ movdqa 16(%eax, %edi), %xmm1
+ por %xmm6, %xmm0
+ por %xmm6, %xmm1
+ movdqa %xmm0, (%edx, %edi)
+ movdqa %xmm1, 16(%edx, %edi)
+ lea 32(%edi), %edi
+ jb L(shl_0_end)
+
+ movdqa (%eax, %edi), %xmm0
+ sub $32, %ecx
+ movdqa 16(%eax, %edi), %xmm1
+ por %xmm6, %xmm0
+ por %xmm6, %xmm1
+ movdqa %xmm0, (%edx, %edi)
+ movdqa %xmm1, 16(%edx, %edi)
+ lea 32(%edi), %edi
+ jb L(shl_0_end)
+
+ movdqa (%eax, %edi), %xmm0
+ sub $32, %ecx
+ movdqa 16(%eax, %edi), %xmm1
+ por %xmm6, %xmm0
+ por %xmm6, %xmm1
+ movdqa %xmm0, (%edx, %edi)
+ movdqa %xmm1, 16(%edx, %edi)
+ lea 32(%edi), %edi
+ jb L(shl_0_end)
+
+ movdqa (%eax, %edi), %xmm0
+ sub $32, %ecx
+ movdqa 16(%eax, %edi), %xmm1
+ por %xmm6, %xmm0
+ por %xmm6, %xmm1
+ movdqa %xmm0, (%edx, %edi)
+ movdqa %xmm1, 16(%edx, %edi)
+ lea 32(%edi), %edi
+L(shl_0_end):
+ lea 32(%ecx), %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ add %edi, %eax
+ add $4, %ecx
+ and $60, %ecx
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+L(shl_0_gobble):
+ POP (%edi)
+ lea -128(%ecx), %ecx
+L(shl_0_gobble_cache_loop):
+ movdqa (%eax), %xmm0
+ sub $128, %ecx
+ movaps 0x10(%eax), %xmm1
+ por %xmm6, %xmm0
+ movaps 0x20(%eax), %xmm2
+ por %xmm6, %xmm1
+ movaps 0x30(%eax), %xmm3
+ por %xmm6, %xmm2
+ movdqa %xmm0, (%edx)
+ por %xmm6, %xmm3
+ movaps %xmm1, 0x10(%edx)
+ movaps %xmm2, 0x20(%edx)
+ movaps %xmm3, 0x30(%edx)
+ movaps 0x40(%eax), %xmm0
+ lea 0x80(%edx), %edx
+ movaps 0x50(%eax), %xmm1
+ por %xmm6, %xmm0
+ movaps 0x60(%eax), %xmm2
+ por %xmm6, %xmm1
+ movaps 0x70(%eax), %xmm3
+ por %xmm6, %xmm2
+ lea 0x80(%eax), %eax
+ movaps %xmm0, -0x40(%edx)
+ por %xmm6, %xmm3
+ movaps %xmm1, -0x30(%edx)
+ movaps %xmm2, -0x20(%edx)
+ movaps %xmm3, -0x10(%edx)
+ jae L(shl_0_gobble_cache_loop)
+ cmp $-0x40, %ecx
+ lea 0x80(%ecx), %ecx
+ jl L(shl_0_cache_less_64bytes)
+ movdqa (%eax), %xmm0
+ sub $0x40, %ecx
+ movdqa 0x10(%eax), %xmm1
+ por %xmm6, %xmm0
+ movdqa 0x20(%eax), %xmm2
+ por %xmm6, %xmm1
+ movdqa 0x30(%eax), %xmm3
+ por %xmm6, %xmm2
+ movdqa %xmm0, (%edx)
+ lea 0x40(%eax), %eax
+ movdqa %xmm1, 0x10(%edx)
+ por %xmm6, %xmm3
+ movdqa %xmm2, 0x20(%edx)
+ movdqa %xmm3, 0x30(%edx)
+ lea 0x40(%edx), %edx
+L(shl_0_cache_less_64bytes):
+ cmp $0x20, %ecx
+ jb L(shl_0_cache_less_32bytes)
+ movdqa (%eax), %xmm0
+ sub $0x20, %ecx
+ movdqa 0x10(%eax), %xmm1
+ por %xmm6, %xmm0
+ lea 0x20(%eax), %eax
+ por %xmm6, %xmm1
+ movdqa %xmm0, (%edx)
+ movdqa %xmm1, 0x10(%edx)
+ lea 0x20(%edx), %edx
+L(shl_0_cache_less_32bytes):
+ cmp $0x10, %ecx
+ jb L(shl_0_cache_less_16bytes)
+ movdqa (%eax), %xmm0
+ sub $0x10, %ecx
+ add $0x10, %eax
+ por %xmm6, %xmm0
+ movdqa %xmm0, (%edx)
+ add $0x10, %edx
+L(shl_0_cache_less_16bytes):
+ add %ecx, %edx
+ add %ecx, %eax
+ add $4, %ecx
+ and $60, %ecx
+ BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
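+/* L(shl_N), 1 <= N <= 15: the aligned source reads start N bytes before
+   the data actually wanted, so PALIGNR $N stitches consecutive 16-byte
+   blocks back together; the alpha mask in XMM6 is ORed in before each
+   aligned store. */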
+ ALIGN (4)
+L(shl_1):
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ lea -32(%ecx), %ecx
+ POP (%esi)
+ POP (%edi)
+ ALIGN (4)
+L(shl_1_loop):
+ movaps 16(%eax), %xmm2
+ sub $32, %ecx
+ movaps 32(%eax), %xmm3
+ lea 32(%eax), %eax
+ movdqa %xmm3, %xmm4
+ palignr $1, %xmm2, %xmm3
+ lea 32(%edx), %edx
+ palignr $1, %xmm1, %xmm2
+ por %xmm6, %xmm2
+ movaps %xmm2, -32(%edx)
+ por %xmm6, %xmm3
+ movaps %xmm3, -16(%edx)
+ jb L(shl_1_end)
+
+ movaps 16(%eax), %xmm2
+ sub $32, %ecx
+ movaps 32(%eax), %xmm3
+ lea 32(%eax), %eax
+ movdqa %xmm3, %xmm1
+ palignr $1, %xmm2, %xmm3
+ lea 32(%edx), %edx
+ palignr $1, %xmm4, %xmm2
+ por %xmm6, %xmm2
+ movaps %xmm2, -32(%edx)
+ por %xmm6, %xmm3
+ movaps %xmm3, -16(%edx)
+ jae L(shl_1_loop)
+L(shl_1_end):
+ lea 32(%ecx), %ecx
+ add %ecx, %edx
+ lea 1(%ecx, %eax), %eax
+ add $4, %ecx
+ and $60, %ecx
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ ALIGN (4)
+L(shl_2):
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ lea -32(%ecx), %ecx
+ POP (%esi)
+ POP (%edi)
+ ALIGN (4)
+L(shl_2_loop):
+ movaps 16(%eax), %xmm2
+ sub $32, %ecx
+ movaps 32(%eax), %xmm3
+ lea 32(%eax), %eax
+ movdqa %xmm3, %xmm4
+ palignr $2, %xmm2, %xmm3
+ lea 32(%edx), %edx
+ palignr $2, %xmm1, %xmm2
+ por %xmm6, %xmm2
+ movaps %xmm2, -32(%edx)
+ por %xmm6, %xmm3
+ movaps %xmm3, -16(%edx)
+ jb L(shl_2_end)
+
+ movaps 16(%eax), %xmm2
+ sub $32, %ecx
+ movaps 32(%eax), %xmm3
+ lea 32(%eax), %eax
+ movdqa %xmm3, %xmm1
+ palignr $2, %xmm2, %xmm3
+ lea 32(%edx), %edx
+ palignr $2, %xmm4, %xmm2
+ por %xmm6, %xmm2
+ movaps %xmm2, -32(%edx)
+ por %xmm6, %xmm3
+ movaps %xmm3, -16(%edx)
+ jae L(shl_2_loop)
+L(shl_2_end):
+ lea 32(%ecx), %ecx
+ add %ecx, %edx
+ lea 2(%ecx, %eax), %eax
+ add $4, %ecx
+ and $60, %ecx
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ ALIGN (4)
+L(shl_3):
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ lea -32(%ecx), %ecx
+ POP (%esi)
+ POP (%edi)
+ ALIGN (4)
+L(shl_3_loop):
+ movaps 16(%eax), %xmm2
+ sub $32, %ecx
+ movaps 32(%eax), %xmm3
+ lea 32(%eax), %eax
+ movdqa %xmm3, %xmm4
+ palignr $3, %xmm2, %xmm3
+ lea 32(%edx), %edx
+ palignr $3, %xmm1, %xmm2
+ por %xmm6, %xmm2
+ movaps %xmm2, -32(%edx)
+ por %xmm6, %xmm3
+ movaps %xmm3, -16(%edx)
+ jb L(shl_3_end)
+
+ movaps 16(%eax), %xmm2
+ sub $32, %ecx
+ movaps 32(%eax), %xmm3
+ lea 32(%eax), %eax
+ movdqa %xmm3, %xmm1
+ palignr $3, %xmm2, %xmm3
+ lea 32(%edx), %edx
+ palignr $3, %xmm4, %xmm2
+ por %xmm6, %xmm2
+ movaps %xmm2, -32(%edx)
+ por %xmm6, %xmm3
+ movaps %xmm3, -16(%edx)
+ jae L(shl_3_loop)
+L(shl_3_end):
+ lea 32(%ecx), %ecx
+ add %ecx, %edx
+ lea 3(%ecx, %eax), %eax
+ add $4, %ecx
+ and $60, %ecx
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ ALIGN (4)
+L(shl_4):
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ lea -32(%ecx), %ecx
+ POP (%esi)
+ POP (%edi)
+ ALIGN (4)
+L(shl_4_loop):
+ movaps 16(%eax), %xmm2
+ sub $32, %ecx
+ movaps 32(%eax), %xmm3
+ lea 32(%eax), %eax
+ movdqa %xmm3, %xmm4
+ palignr $4, %xmm2, %xmm3
+ lea 32(%edx), %edx
+ palignr $4, %xmm1, %xmm2
+ por %xmm6, %xmm2
+ movaps %xmm2, -32(%edx)
+ por %xmm6, %xmm3
+ movaps %xmm3, -16(%edx)
+ jb L(shl_4_end)
+
+ movaps 16(%eax), %xmm2
+ sub $32, %ecx
+ movaps 32(%eax), %xmm3
+ lea 32(%eax), %eax
+ movdqa %xmm3, %xmm1
+ palignr $4, %xmm2, %xmm3
+ lea 32(%edx), %edx
+ palignr $4, %xmm4, %xmm2
+ por %xmm6, %xmm2
+ movaps %xmm2, -32(%edx)
+ por %xmm6, %xmm3
+ movaps %xmm3, -16(%edx)
+ jae L(shl_4_loop)
+L(shl_4_end):
+ lea 32(%ecx), %ecx
+ add %ecx, %edx
+ lea 4(%ecx, %eax), %eax
+ add $4, %ecx
+ and $60, %ecx
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ ALIGN (4)
+L(shl_5):
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ lea -32(%ecx), %ecx
+ POP (%esi)
+ POP (%edi)
+ ALIGN (4)
+L(shl_5_loop):
+ movaps 16(%eax), %xmm2
+ sub $32, %ecx
+ movaps 32(%eax), %xmm3
+ lea 32(%eax), %eax
+ movdqa %xmm3, %xmm4
+ palignr $5, %xmm2, %xmm3
+ lea 32(%edx), %edx
+ palignr $5, %xmm1, %xmm2
+ por %xmm6, %xmm2
+ movaps %xmm2, -32(%edx)
+ por %xmm6, %xmm3
+ movaps %xmm3, -16(%edx)
+ jb L(shl_5_end)
+
+ movaps 16(%eax), %xmm2
+ sub $32, %ecx
+ movaps 32(%eax), %xmm3
+ lea 32(%eax), %eax
+ movdqa %xmm3, %xmm1
+ palignr $5, %xmm2, %xmm3
+ lea 32(%edx), %edx
+ palignr $5, %xmm4, %xmm2
+ por %xmm6, %xmm2
+ movaps %xmm2, -32(%edx)
+ por %xmm6, %xmm3
+ movaps %xmm3, -16(%edx)
+ jae L(shl_5_loop)
+L(shl_5_end):
+ lea 32(%ecx), %ecx
+ add %ecx, %edx
+ lea 5(%ecx, %eax), %eax
+ add $4, %ecx
+ and $60, %ecx
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ ALIGN (4)
+L(shl_6):
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ lea -32(%ecx), %ecx
+ POP (%esi)
+ POP (%edi)
+ ALIGN (4)
+L(shl_6_loop):
+ movaps 16(%eax), %xmm2
+ sub $32, %ecx
+ movaps 32(%eax), %xmm3
+ lea 32(%eax), %eax
+ movdqa %xmm3, %xmm4
+ palignr $6, %xmm2, %xmm3
+ lea 32(%edx), %edx
+ palignr $6, %xmm1, %xmm2
+ por %xmm6, %xmm2
+ movaps %xmm2, -32(%edx)
+ por %xmm6, %xmm3
+ movaps %xmm3, -16(%edx)
+ jb L(shl_6_end)
+
+ movaps 16(%eax), %xmm2
+ sub $32, %ecx
+ movaps 32(%eax), %xmm3
+ lea 32(%eax), %eax
+ movdqa %xmm3, %xmm1
+ palignr $6, %xmm2, %xmm3
+ lea 32(%edx), %edx
+ palignr $6, %xmm4, %xmm2
+ por %xmm6, %xmm2
+ movaps %xmm2, -32(%edx)
+ por %xmm6, %xmm3
+ movaps %xmm3, -16(%edx)
+ jae L(shl_6_loop)
+L(shl_6_end):
+ lea 32(%ecx), %ecx
+ add %ecx, %edx
+ lea 6(%ecx, %eax), %eax
+ add $4, %ecx
+ and $60, %ecx
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ ALIGN (4)
+L(shl_7):
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ lea -32(%ecx), %ecx
+ POP (%esi)
+ POP (%edi)
+ ALIGN (4)
+L(shl_7_loop):
+ movaps 16(%eax), %xmm2
+ sub $32, %ecx
+ movaps 32(%eax), %xmm3
+ lea 32(%eax), %eax
+ movdqa %xmm3, %xmm4
+ palignr $7, %xmm2, %xmm3
+ lea 32(%edx), %edx
+ palignr $7, %xmm1, %xmm2
+ por %xmm6, %xmm2
+ movaps %xmm2, -32(%edx)
+ por %xmm6, %xmm3
+ movaps %xmm3, -16(%edx)
+ jb L(shl_7_end)
+
+ movaps 16(%eax), %xmm2
+ sub $32, %ecx
+ movaps 32(%eax), %xmm3
+ lea 32(%eax), %eax
+ movdqa %xmm3, %xmm1
+ palignr $7, %xmm2, %xmm3
+ lea 32(%edx), %edx
+ palignr $7, %xmm4, %xmm2
+ por %xmm6, %xmm2
+ movaps %xmm2, -32(%edx)
+ por %xmm6, %xmm3
+ movaps %xmm3, -16(%edx)
+ jae L(shl_7_loop)
+L(shl_7_end):
+ lea 32(%ecx), %ecx
+ add %ecx, %edx
+ lea 7(%ecx, %eax), %eax
+ add $4, %ecx
+ and $60, %ecx
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ ALIGN (4)
+L(shl_8):
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ lea -32(%ecx), %ecx
+ POP (%esi)
+ POP (%edi)
+ ALIGN (4)
+L(shl_8_loop):
+ movaps 16(%eax), %xmm2
+ sub $32, %ecx
+ movaps 32(%eax), %xmm3
+ lea 32(%eax), %eax
+ movdqa %xmm3, %xmm4
+ palignr $8, %xmm2, %xmm3
+ lea 32(%edx), %edx
+ palignr $8, %xmm1, %xmm2
+ por %xmm6, %xmm2
+ movaps %xmm2, -32(%edx)
+ por %xmm6, %xmm3
+ movaps %xmm3, -16(%edx)
+ jb L(shl_8_end)
+
+ movaps 16(%eax), %xmm2
+ sub $32, %ecx
+ movaps 32(%eax), %xmm3
+ lea 32(%eax), %eax
+ movdqa %xmm3, %xmm1
+ palignr $8, %xmm2, %xmm3
+ lea 32(%edx), %edx
+ palignr $8, %xmm4, %xmm2
+ por %xmm6, %xmm2
+ movaps %xmm2, -32(%edx)
+ por %xmm6, %xmm3
+ movaps %xmm3, -16(%edx)
+ jae L(shl_8_loop)
+L(shl_8_end):
+ lea 32(%ecx), %ecx
+ add %ecx, %edx
+ lea 8(%ecx, %eax), %eax
+ add $4, %ecx
+ and $60, %ecx
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ ALIGN (4)
+L(shl_9):
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ lea -32(%ecx), %ecx
+ POP (%esi)
+ POP (%edi)
+ ALIGN (4)
+L(shl_9_loop):
+ movaps 16(%eax), %xmm2
+ sub $32, %ecx
+ movaps 32(%eax), %xmm3
+ lea 32(%eax), %eax
+ movdqa %xmm3, %xmm4
+ palignr $9, %xmm2, %xmm3
+ lea 32(%edx), %edx
+ palignr $9, %xmm1, %xmm2
+ por %xmm6, %xmm2
+ movaps %xmm2, -32(%edx)
+ por %xmm6, %xmm3
+ movaps %xmm3, -16(%edx)
+ jb L(shl_9_end)
+
+ movaps 16(%eax), %xmm2
+ sub $32, %ecx
+ movaps 32(%eax), %xmm3
+ lea 32(%eax), %eax
+ movdqa %xmm3, %xmm1
+ palignr $9, %xmm2, %xmm3
+ lea 32(%edx), %edx
+ palignr $9, %xmm4, %xmm2
+ por %xmm6, %xmm2
+ movaps %xmm2, -32(%edx)
+ por %xmm6, %xmm3
+ movaps %xmm3, -16(%edx)
+ jae L(shl_9_loop)
+L(shl_9_end):
+ lea 32(%ecx), %ecx
+ add %ecx, %edx
+ lea 9(%ecx, %eax), %eax
+ add $4, %ecx
+ and $60, %ecx
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ ALIGN (4)
+L(shl_10):
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ lea -32(%ecx), %ecx
+ POP (%esi)
+ POP (%edi)
+ ALIGN (4)
+L(shl_10_loop):
+ movaps 16(%eax), %xmm2
+ sub $32, %ecx
+ movaps 32(%eax), %xmm3
+ lea 32(%eax), %eax
+ movdqa %xmm3, %xmm4
+ palignr $10, %xmm2, %xmm3
+ lea 32(%edx), %edx
+ palignr $10, %xmm1, %xmm2
+ por %xmm6, %xmm2
+ movaps %xmm2, -32(%edx)
+ por %xmm6, %xmm3
+ movaps %xmm3, -16(%edx)
+ jb L(shl_10_end)
+
+ movaps 16(%eax), %xmm2
+ sub $32, %ecx
+ movaps 32(%eax), %xmm3
+ lea 32(%eax), %eax
+ movdqa %xmm3, %xmm1
+ palignr $10, %xmm2, %xmm3
+ lea 32(%edx), %edx
+ palignr $10, %xmm4, %xmm2
+ por %xmm6, %xmm2
+ movaps %xmm2, -32(%edx)
+ por %xmm6, %xmm3
+ movaps %xmm3, -16(%edx)
+ jae L(shl_10_loop)
+L(shl_10_end):
+ lea 32(%ecx), %ecx
+ add %ecx, %edx
+ lea 10(%ecx, %eax), %eax
+ add $4, %ecx
+ and $60, %ecx
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ ALIGN (4)
+L(shl_11):
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ lea -32(%ecx), %ecx
+ POP (%esi)
+ POP (%edi)
+ ALIGN (4)
+L(shl_11_loop):
+ movaps 16(%eax), %xmm2
+ sub $32, %ecx
+ movaps 32(%eax), %xmm3
+ lea 32(%eax), %eax
+ movdqa %xmm3, %xmm4
+ palignr $11, %xmm2, %xmm3
+ lea 32(%edx), %edx
+ palignr $11, %xmm1, %xmm2
+ por %xmm6, %xmm2
+ movaps %xmm2, -32(%edx)
+ por %xmm6, %xmm3
+ movaps %xmm3, -16(%edx)
+ jb L(shl_11_end)
+
+ movaps 16(%eax), %xmm2
+ sub $32, %ecx
+ movaps 32(%eax), %xmm3
+ lea 32(%eax), %eax
+ movdqa %xmm3, %xmm1
+ palignr $11, %xmm2, %xmm3
+ lea 32(%edx), %edx
+ palignr $11, %xmm4, %xmm2
+ por %xmm6, %xmm2
+ movaps %xmm2, -32(%edx)
+ por %xmm6, %xmm3
+ movaps %xmm3, -16(%edx)
+ jae L(shl_11_loop)
+L(shl_11_end):
+ lea 32(%ecx), %ecx
+ add %ecx, %edx
+ lea 11(%ecx, %eax), %eax
+ add $4, %ecx
+ and $60, %ecx
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ ALIGN (4)
+L(shl_12):
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ lea -32(%ecx), %ecx
+ POP (%esi)
+ POP (%edi)
+ ALIGN (4)
+L(shl_12_loop):
+ movaps 16(%eax), %xmm2
+ sub $32, %ecx
+ movaps 32(%eax), %xmm3
+ lea 32(%eax), %eax
+ movdqa %xmm3, %xmm4
+ palignr $12, %xmm2, %xmm3
+ lea 32(%edx), %edx
+ palignr $12, %xmm1, %xmm2
+ por %xmm6, %xmm2
+ movaps %xmm2, -32(%edx)
+ por %xmm6, %xmm3
+ movaps %xmm3, -16(%edx)
+ jb L(shl_12_end)
+
+ movaps 16(%eax), %xmm2
+ sub $32, %ecx
+ movaps 32(%eax), %xmm3
+ lea 32(%eax), %eax
+ movdqa %xmm3, %xmm1
+ palignr $12, %xmm2, %xmm3
+ lea 32(%edx), %edx
+ palignr $12, %xmm4, %xmm2
+ por %xmm6, %xmm2
+ movaps %xmm2, -32(%edx)
+ por %xmm6, %xmm3
+ movaps %xmm3, -16(%edx)
+ jae L(shl_12_loop)
+L(shl_12_end):
+ lea 32(%ecx), %ecx
+ add %ecx, %edx
+ lea 12(%ecx, %eax), %eax
+ add $4, %ecx
+ and $60, %ecx
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ ALIGN (4)
+L(shl_13):
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ lea -32(%ecx), %ecx
+ POP (%esi)
+ POP (%edi)
+ ALIGN (4)
+L(shl_13_loop):
+ movaps 16(%eax), %xmm2
+ sub $32, %ecx
+ movaps 32(%eax), %xmm3
+ lea 32(%eax), %eax
+ movdqa %xmm3, %xmm4
+ palignr $13, %xmm2, %xmm3
+ lea 32(%edx), %edx
+ palignr $13, %xmm1, %xmm2
+ por %xmm6, %xmm2
+ movaps %xmm2, -32(%edx)
+ por %xmm6, %xmm3
+ movaps %xmm3, -16(%edx)
+ jb L(shl_13_end)
+
+ movaps 16(%eax), %xmm2
+ sub $32, %ecx
+ movaps 32(%eax), %xmm3
+ lea 32(%eax), %eax
+ movdqa %xmm3, %xmm1
+ palignr $13, %xmm2, %xmm3
+ lea 32(%edx), %edx
+ palignr $13, %xmm4, %xmm2
+ por %xmm6, %xmm2
+ movaps %xmm2, -32(%edx)
+ por %xmm6, %xmm3
+ movaps %xmm3, -16(%edx)
+ jae L(shl_13_loop)
+L(shl_13_end):
+ lea 32(%ecx), %ecx
+ add %ecx, %edx
+ lea 13(%ecx, %eax), %eax
+ add $4, %ecx
+ and $60, %ecx
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ ALIGN (4)
+L(shl_14):
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ lea -32(%ecx), %ecx
+ POP (%esi)
+ POP (%edi)
+ ALIGN (4)
+L(shl_14_loop):
+ movaps 16(%eax), %xmm2
+ sub $32, %ecx
+ movaps 32(%eax), %xmm3
+ lea 32(%eax), %eax
+ movdqa %xmm3, %xmm4
+ palignr $14, %xmm2, %xmm3
+ lea 32(%edx), %edx
+ palignr $14, %xmm1, %xmm2
+ por %xmm6, %xmm2
+ movaps %xmm2, -32(%edx)
+ por %xmm6, %xmm3
+ movaps %xmm3, -16(%edx)
+ jb L(shl_14_end)
+
+ movaps 16(%eax), %xmm2
+ sub $32, %ecx
+ movaps 32(%eax), %xmm3
+ lea 32(%eax), %eax
+ movdqa %xmm3, %xmm1
+ palignr $14, %xmm2, %xmm3
+ lea 32(%edx), %edx
+ palignr $14, %xmm4, %xmm2
+ por %xmm6, %xmm2
+ movaps %xmm2, -32(%edx)
+ por %xmm6, %xmm3
+ movaps %xmm3, -16(%edx)
+ jae L(shl_14_loop)
+L(shl_14_end):
+ lea 32(%ecx), %ecx
+ add %ecx, %edx
+ lea 14(%ecx, %eax), %eax
+ add $4, %ecx
+ and $60, %ecx
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ ALIGN (4)
+L(shl_15):
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ lea -32(%ecx), %ecx
+ POP (%esi)
+ POP (%edi)
+ ALIGN (4)
+L(shl_15_loop):
+ movaps 16(%eax), %xmm2
+ sub $32, %ecx
+ movaps 32(%eax), %xmm3
+ lea 32(%eax), %eax
+ movdqa %xmm3, %xmm4
+ palignr $15, %xmm2, %xmm3
+ lea 32(%edx), %edx
+ palignr $15, %xmm1, %xmm2
+ por %xmm6, %xmm2
+ movaps %xmm2, -32(%edx)
+ por %xmm6, %xmm3
+ movaps %xmm3, -16(%edx)
+ jb L(shl_15_end)
+
+ movaps 16(%eax), %xmm2
+ sub $32, %ecx
+ movaps 32(%eax), %xmm3
+ lea 32(%eax), %eax
+ movdqa %xmm3, %xmm1
+ palignr $15, %xmm2, %xmm3
+ lea 32(%edx), %edx
+ palignr $15, %xmm4, %xmm2
+ por %xmm6, %xmm2
+ movaps %xmm2, -32(%edx)
+ por %xmm6, %xmm3
+ movaps %xmm3, -16(%edx)
+ jae L(shl_15_loop)
+L(shl_15_end):
+ lea 32(%ecx), %ecx
+ add %ecx, %edx
+ lea 15(%ecx, %eax), %eax
+ add $4, %ecx
+ and $60, %ecx
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ ALIGN (4)
+L(fwd_write_44bytes):
+ movl $0xff000000, %ecx
+ or -44(%eax), %ecx
+ movl %ecx, -44(%edx)
+L(fwd_write_40bytes):
+ movl $0xff000000, %ecx
+ or -40(%eax), %ecx
+ movl %ecx, -40(%edx)
+L(fwd_write_36bytes):
+ movl $0xff000000, %ecx
+ or -36(%eax), %ecx
+ movl %ecx, -36(%edx)
+L(fwd_write_32bytes):
+ movl $0xff000000, %ecx
+ or -32(%eax), %ecx
+ movl %ecx, -32(%edx)
+L(fwd_write_28bytes):
+ movl $0xff000000, %ecx
+ or -28(%eax), %ecx
+ movl %ecx, -28(%edx)
+L(fwd_write_24bytes):
+ movl $0xff000000, %ecx
+ or -24(%eax), %ecx
+ movl %ecx, -24(%edx)
+L(fwd_write_20bytes):
+ movl $0xff000000, %ecx
+ or -20(%eax), %ecx
+ movl %ecx, -20(%edx)
+L(fwd_write_16bytes):
+ movl $0xff000000, %ecx
+ or -16(%eax), %ecx
+ movl %ecx, -16(%edx)
+L(fwd_write_12bytes):
+ movl $0xff000000, %ecx
+ or -12(%eax), %ecx
+ movl %ecx, -12(%edx)
+L(fwd_write_8bytes):
+ movl $0xff000000, %ecx
+ or -8(%eax), %ecx
+ movl %ecx, -8(%edx)
+L(fwd_write_4bytes):
+ movl $0xff000000, %ecx
+ or -4(%eax), %ecx
+ movl %ecx, -4(%edx)
+L(fwd_write_0bytes):
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+#endif
+
+ RETURN
+
+ ALIGN (4)
+L(bk_write_44bytes):
+ movl $0xff000000, %ecx
+ or 40(%eax), %ecx
+ movl %ecx, 40(%edx)
+L(bk_write_40bytes):
+ movl $0xff000000, %ecx
+ or 36(%eax), %ecx
+ movl %ecx, 36(%edx)
+L(bk_write_36bytes):
+ movl $0xff000000, %ecx
+ or 32(%eax), %ecx
+ movl %ecx, 32(%edx)
+L(bk_write_32bytes):
+ movl $0xff000000, %ecx
+ or 28(%eax), %ecx
+ movl %ecx, 28(%edx)
+L(bk_write_28bytes):
+ movl $0xff000000, %ecx
+ or 24(%eax), %ecx
+ movl %ecx, 24(%edx)
+L(bk_write_24bytes):
+ movl $0xff000000, %ecx
+ or 20(%eax), %ecx
+ movl %ecx, 20(%edx)
+L(bk_write_20bytes):
+ movl $0xff000000, %ecx
+ or 16(%eax), %ecx
+ movl %ecx, 16(%edx)
+L(bk_write_16bytes):
+ movl $0xff000000, %ecx
+ or 12(%eax), %ecx
+ movl %ecx, 12(%edx)
+L(bk_write_12bytes):
+ movl $0xff000000, %ecx
+ or 8(%eax), %ecx
+ movl %ecx, 8(%edx)
+L(bk_write_8bytes):
+ movl $0xff000000, %ecx
+ or 4(%eax), %ecx
+ movl %ecx, 4(%edx)
+L(bk_write_4bytes):
+ movl $0xff000000, %ecx
+ or (%eax), %ecx
+ movl %ecx, (%edx)
+L(bk_write_0bytes):
+#ifndef USE_AS_BCOPY
+ movl DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+ movl LEN(%esp), %ecx
+ add %ecx, %eax
+# endif
+#endif
+ RETURN
+
+
+ .pushsection .rodata.ssse3,"a",@progbits
+ ALIGN (2)
+L(table_48bytes_fwd):
+ .int JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
+
+ ALIGN (2)
+L(shl_table):
+ .int JMPTBL (L(shl_0), L(shl_table))
+ .int JMPTBL (L(shl_1), L(shl_table))
+ .int JMPTBL (L(shl_2), L(shl_table))
+ .int JMPTBL (L(shl_3), L(shl_table))
+ .int JMPTBL (L(shl_4), L(shl_table))
+ .int JMPTBL (L(shl_5), L(shl_table))
+ .int JMPTBL (L(shl_6), L(shl_table))
+ .int JMPTBL (L(shl_7), L(shl_table))
+ .int JMPTBL (L(shl_8), L(shl_table))
+ .int JMPTBL (L(shl_9), L(shl_table))
+ .int JMPTBL (L(shl_10), L(shl_table))
+ .int JMPTBL (L(shl_11), L(shl_table))
+ .int JMPTBL (L(shl_12), L(shl_table))
+ .int JMPTBL (L(shl_13), L(shl_table))
+ .int JMPTBL (L(shl_14), L(shl_table))
+ .int JMPTBL (L(shl_15), L(shl_table))
+
+ ALIGN (2)
+L(table_48_bytes_bwd):
+ .int JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
+
+ .popsection
+
+END (MEMCPY_OR)
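
In C terms, the BRANCH_TO_JMPTBL_ENTRY dispatch used throughout the
listing above amounts to an indexed call through a table of tail
handlers; a hypothetical sketch (tail_fn and dispatch_tail are
illustrative names, not symbols from this patch):

    #include <stdint.h>

    typedef void (*tail_fn) (uint32_t *dest, const uint32_t *src);

    /* The remaining byte count, always a multiple of 4 here, selects
     * one of the twelve handlers that finish the last 0..44 bytes
     * (cf. the `shr $2' and the scaled table load in the macro). */
    static void
    dispatch_tail (tail_fn table[12], uint32_t *dest, const uint32_t *src,
                   int remaining_bytes)
    {
        table[remaining_bytes / 4] (dest, src);
    }
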
diff --git a/pixman/pixman-access.c b/pixman/pixman-access.c
index 56de711..db085b6 100644
--- a/pixman/pixman-access.c
+++ b/pixman/pixman-access.c
@@ -113,10 +113,16 @@ fetch_scanline_x8r8g8b8 (pixman_image_t *image,
{
const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
const uint32_t *pixel = (const uint32_t *)bits + x;
+
+#if defined(USE_SSSE3) && !defined(PIXMAN_FB_ACCESSORS)
+ extern void *fetch_scanline_x8r8g8b8_ssse3_fast_path(void *dest, const void *src, int count);
+ fetch_scanline_x8r8g8b8_ssse3_fast_path(buffer, pixel, width * 4);
+#else
const uint32_t *end = pixel + width;
while (pixel < end)
*buffer++ = READ (image, pixel++) | 0xff000000;
+#endif
}
static void
--
1.7.0.4