[PATCH] Optimize fetch_scanline_x8r8g8b8 with SSSE3 instructions

Liu Xinyun xinyun.liu@intel.com
Wed Aug 11 01:31:47 PDT 2010

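fetch_scanline_x8r8g8b8 copies a scanline of 32-bit pixels while forcing
the unused alpha byte of every pixel to 0xff.  This patch adds an SSSE3
fast path that performs the copy 16 bytes at a time, applying the
0xff000000 mask in the vector domain with POR and realigning misaligned
sources with PALIGNR.  For reference, a minimal scalar sketch of the
operation the new assembly performs (the function name here is
illustrative only):

    #include <stdint.h>

    /* Copy 'width' x8r8g8b8 pixels, forcing the unused alpha byte to
       0xff so the output is a valid a8r8g8b8 scanline. */
    static void
    fetch_scanline_x8r8g8b8_scalar (uint32_t       *buffer,
                                    const uint32_t *pixel,
                                    int             width)
    {
        const uint32_t *end = pixel + width;

        while (pixel < end)
            *buffer++ = *pixel++ | 0xff000000;
    }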

Signed-off-by: Liu Xinyun <xinyun.liu@intel.com>
Signed-off-by: Xu, Samuel <samuel.xu@intel.com>
Signed-off-by: Ma, Ling <ling.ma@intel.com>
---
 configure.ac                 |   58 +++
 pixman/Makefile.am           |   14 +
 pixman/pixman-access-ssse3.S | 1138 ++++++++++++++++++++++++++++++++++++++++++
 pixman/pixman-access.c       |    7 +
 4 files changed, 1217 insertions(+), 0 deletions(-)
 create mode 100644 pixman/pixman-access-ssse3.S

diff --git a/configure.ac b/configure.ac
index 98c2783..ed03d24 100644
--- a/configure.ac
+++ b/configure.ac
@@ -370,6 +370,59 @@ fi
 AM_CONDITIONAL(USE_SSE2, test $have_sse2_intrinsics = yes)
 
 dnl ===========================================================================
+dnl Check for SSSE3
+
+if test "x$SSSE3_CFLAGS" = "x" ; then
+   if test "x$SUNCC" = "xyes"; then
+      # SSSE3 is enabled by default in the Sun Studio 64-bit environment
+      if test "$AMD64_ABI" = "no" ; then
+         SSSE3_CFLAGS="-xarch=ssse3"
+      fi
+   else
+      SSSE3_CFLAGS="-mmmx -mssse3 -Winline"
+   fi
+fi
+
+have_ssse3_intrinsics=no
+AC_MSG_CHECKING(whether to use SSSE3 intrinsics)
+xserver_save_CFLAGS=$CFLAGS
+CFLAGS="$SSSE3_CFLAGS $CFLAGS"
+
+AC_COMPILE_IFELSE([
+#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 2))
+#   if !defined(__amd64__) && !defined(__x86_64__)
+#      error "Need GCC >= 4.2 for SSSE3 intrinsics on x86"
+#   endif
+#endif
+int main () {
+    __asm__ __volatile__("PSHUFB %xmm1,%xmm0;");
+    return 0;
+}], have_ssse3_intrinsics=yes)
+CFLAGS=$xserver_save_CFLAGS
+
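+dnl --disable-ssse3 skips the fast path unconditionally; an explicit
+dnl --enable-ssse3 makes configure fail hard if support is missing.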
+AC_ARG_ENABLE(ssse3,
+   [AC_HELP_STRING([--disable-ssse3],
+                   [disable SSSE3 fast paths])],
+   [enable_ssse3=$enableval], [enable_ssse3=auto])
+
+if test $enable_ssse3 = no ; then
+   have_ssse3_intrinsics=disabled
+fi
+
+if test $have_ssse3_intrinsics = yes ; then
+   AC_DEFINE(USE_SSSE3, 1, [use SSSE3 compiler intrinsics])
+fi
+
+AC_MSG_RESULT($have_ssse3_intrinsics)
+if test $enable_ssse3 = yes && test $have_ssse3_intrinsics = no ; then
+   AC_MSG_ERROR([SSSE3 intrinsics not detected])
+fi
+
+AM_CONDITIONAL(USE_SSSE3, test $have_ssse3_intrinsics = yes)
+
+dnl ===========================================================================
 dnl Other special flags needed when building code using MMX or SSE instructions
 case $host_os in
    solaris*)
@@ -395,6 +446,9 @@ case $host_os in
       if test "x$SSE2_LDFLAGS" = "x" ; then
 	 SSE2_LDFLAGS="$HWCAP_LDFLAGS"
       fi
+      if test "x$SSSE3_LDFLAGS" = "x" ; then
+	 SSSE3_LDFLAGS="$HWCAP_LDFLAGS"
+      fi
       ;;
 esac
 
@@ -402,6 +456,8 @@ AC_SUBST(MMX_CFLAGS)
 AC_SUBST(MMX_LDFLAGS)
 AC_SUBST(SSE2_CFLAGS)
 AC_SUBST(SSE2_LDFLAGS)
+AC_SUBST(SSSE3_CFLAGS)
+AC_SUBST(SSSE3_LDFLAGS)
 
 dnl ===========================================================================
 dnl Check for VMX/Altivec
diff --git a/pixman/Makefile.am b/pixman/Makefile.am
index a9de19f..ae7f1b0 100644
--- a/pixman/Makefile.am
+++ b/pixman/Makefile.am
@@ -93,6 +93,20 @@ libpixman_1_la_LIBADD += libpixman-sse2.la
 ASM_CFLAGS_sse2=$(SSE2_CFLAGS)
 endif
 
+# ssse3 code
+if USE_SSSE3
+noinst_LTLIBRARIES += libpixman-ssse3.la
+libpixman_ssse3_la_SOURCES = \
+	 pixman-access-ssse3.S
+libpixman_ssse3_la_CFLAGS = $(DEP_CFLAGS) $(SSSE3_CFLAGS)
+libpixman_ssse3_la_LIBADD = $(DEP_LIBS)
+libpixman_1_la_LDFLAGS += $(SSSE3_LDFLAGS)
+libpixman_1_la_LIBADD += libpixman-ssse3.la
+
+ASM_CFLAGS_ssse3=$(SSSE3_CFLAGS)
+endif
+
+
 # arm simd code
 if USE_ARM_SIMD
 noinst_LTLIBRARIES += libpixman-arm-simd.la
diff --git a/pixman/pixman-access-ssse3.S b/pixman/pixman-access-ssse3.S
new file mode 100644
index 0000000..f0ebd36
--- /dev/null
+++ b/pixman/pixman-access-ssse3.S
@@ -0,0 +1,1138 @@
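+/* fetch_scanline_x8r8g8b8_ssse3_fast_path: copy LEN bytes of x8r8g8b8
+ * pixels from SRC to DEST, forcing the alpha byte of every 32-bit word
+ * written to 0xff.  The copy skeleton appears to be adapted from the
+ * i686 SSSE3 memcpy, which would explain the USE_AS_BCOPY and
+ * USE_AS_MEMPCPY conditionals that survive below even though only the
+ * plain entry point is used here.  */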
+#ifndef MEMCPY_OR
+# define MEMCPY_OR         fetch_scanline_x8r8g8b8_ssse3_fast_path
+#endif
+
+#ifndef L
+# define L(label)	.L##label
+#endif
+
+#ifndef ALIGN
+# define ALIGN(n)	.p2align n
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc			.cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc			.cfi_endproc
+#endif
+
+#ifndef cfi_rel_offset
+# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
+#endif
+
+#ifndef cfi_restore
+# define cfi_restore(reg)		.cfi_restore reg
+#endif
+
+#ifndef cfi_adjust_cfa_offset
+# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
+#endif
+
+#ifndef cfi_remember_state
+# define cfi_remember_state		.cfi_remember_state
+#endif
+
+#ifndef cfi_restore_state
+# define cfi_restore_state		.cfi_restore_state
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name)			\
+	.type name,  @function; 	\
+	.globl name;			\
+	.p2align 4;			\
+name:					\
+	cfi_startproc
+#endif
+
+#ifndef END
+# define END(name)			\
+	cfi_endproc;			\
+	.size name, .-name
+#endif
+
+#ifdef USE_AS_BCOPY
+# define SRC		PARMS
+# define DEST		SRC+4
+# define LEN		DEST+4
+#else
+# define DEST		PARMS
+# define SRC		DEST+4
+# define LEN		SRC+4
+#endif
+
+#define CFI_PUSH(REG)						\
+  cfi_adjust_cfa_offset (4);					\
+  cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)						\
+  cfi_adjust_cfa_offset (-4);					\
+  cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+
+# define PARMS		8		/* Preserve EBX.  */
+# define ENTRANCE	PUSH (%ebx);
+# define RETURN_END	POP (%ebx); ret
+# define RETURN		RETURN_END; CFI_PUSH (%ebx)
+# define JMPTBL(I, B)	I - B
+
+/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
+   jump table with relative offsets.  INDEX is a register that contains
+   the index into the jump table.  SCALE is the scale of INDEX.  */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+    /* We first load PC into EBX.  */				\
+    call	__i686.get_pc_thunk.bx;				\
+    /* Get the address of the jump table.  */			\
+    addl	$(TABLE - .), %ebx;				\
+    shr		$2, INDEX;				\
+    /* Get the entry and convert the relative offset to the	\
+       absolute address.  */					\
+    addl	(%ebx,INDEX,SCALE), %ebx;			\
+    /* We loaded the jump table.  Go.  */			\
+    jmp		*%ebx
+
+# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)			\
+    addl	$(TABLE - .), %ebx
+
+# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE)	\
+    shr		$2, INDEX;					\
+    addl	(%ebx,INDEX,SCALE), %ebx;			\
+    /* We loaded the jump table.  Go.  */			\
+    jmp		*%ebx
+
+	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+	.globl	__i686.get_pc_thunk.bx
+	.hidden	__i686.get_pc_thunk.bx
+	ALIGN (4)
+	.type	__i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+	movl	(%esp), %ebx
+	ret
+
+	.section .text.ssse3,"ax",@progbits
+ENTRY (MEMCPY_OR)
+	ENTRANCE
+	movl	LEN(%esp), %ecx
+	movl	SRC(%esp), %eax
+	movl	DEST(%esp), %edx
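+	/* %edx = DEST, %eax = SRC, %ecx = LEN in bytes.  */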
+
+	cmp	$48, %ecx
+	jae	L(48bytesormore)
+
+	cmp	%dl, %al
+	jb	L(bk_write)
+	add	%ecx, %edx
+	add	%ecx, %eax
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+L(bk_write):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
+
+	ALIGN (4)
+/* At least 48 bytes to copy.  */
+L(48bytesormore):
+	movdqu	(%eax), %xmm0
+	PUSH (%edi)
+	mov	$0xff000000, %edi
+	movd	%edi, %xmm6
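+	/* %xmm6 = 0xff000000 alpha mask, broadcast to all lanes by pshufd below.  */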
+	movl	%edx, %edi
+	and	$-16, %edx
+	PUSH (%esi)
+	add	$16, %edx
+	movl	%edi, %esi
+	sub	%edx, %edi
+	add	%edi, %ecx
+	sub	%edi, %eax
+
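+	/* If the destination is not 4-byte aligned, shift the alpha mask
+	   with psrldq so 0xff still lines up with the alpha byte of each
+	   pixel within the 16-byte-aligned stores.  */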
+	mov	%esi, %edi
+	pshufd	$0, %xmm6, %xmm6
+	and	$3, %edi
+	por	%xmm6, %xmm0
+	jz	L(aligned4bytes)
+	cmp	$3, %edi
+	psrldq	$1, %xmm6
+	jz	L(aligned4bytes)
+	cmp	$2, %edi
+	psrldq	$1, %xmm6
+	jz	L(aligned4bytes)
+	psrldq	$1, %xmm6
+L(aligned4bytes):
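+	/* Dispatch on src & 15: each L(shl_N) realigns the source with palignr $N.  */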
+	mov	%eax, %edi
+	and	$0xf, %edi
+	jz	L(shl_0)
+	sub	%edi, %eax
+	call	__i686.get_pc_thunk.bx
+	addl	$(L(shl_table)- .), %ebx
+	movaps	(%eax), %xmm1
+	addl	(%ebx,%edi,4), %ebx
+	movdqu	%xmm0, (%esi)
+	jmp	*%ebx
+
+	ALIGN (4)
+L(shl_0):
+	movdqu	%xmm0, (%esi)
+	xor	%edi, %edi
+	POP (%esi)
+	cmp	$127, %ecx
+	ja	L(shl_0_gobble)
+	lea	-32(%ecx), %ecx
+
+	movdqa	(%eax, %edi), %xmm0
+	sub	$32, %ecx
+	movdqa	16(%eax, %edi), %xmm1
+	por	%xmm6, 	%xmm0
+	por	%xmm6, 	%xmm1
+	movdqa	%xmm0, (%edx, %edi)
+	movdqa	%xmm1, 16(%edx, %edi)
+	lea	32(%edi), %edi
+	jb	L(shl_0_end)
+
+	movdqa	(%eax, %edi), %xmm0
+	sub	$32, %ecx
+	movdqa	16(%eax, %edi), %xmm1
+	por	%xmm6, 	%xmm0
+	por	%xmm6, 	%xmm1
+	movdqa	%xmm0, (%edx, %edi)
+	movdqa	%xmm1, 16(%edx, %edi)
+	lea	32(%edi), %edi
+	jb	L(shl_0_end)
+
+	movdqa	(%eax, %edi), %xmm0
+	sub	$32, %ecx
+	movdqa	16(%eax, %edi), %xmm1
+	por	%xmm6, 	%xmm0
+	por	%xmm6, 	%xmm1
+	movdqa	%xmm0, (%edx, %edi)
+	movdqa	%xmm1, 16(%edx, %edi)
+	lea	32(%edi), %edi
+	jb	L(shl_0_end)
+
+	movdqa	(%eax, %edi), %xmm0
+	sub	$32, %ecx
+	movdqa	16(%eax, %edi), %xmm1
+	por	%xmm6, 	%xmm0
+	por	%xmm6, 	%xmm1
+	movdqa	%xmm0, (%edx, %edi)
+	movdqa	%xmm1, 16(%edx, %edi)
+	lea	32(%edi), %edi
+L(shl_0_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	add	%edi, %eax
+	add	$4, %ecx
+	and	$60, %ecx
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+L(shl_0_gobble):
+	POP (%edi)
+	lea	-128(%ecx), %ecx
+L(shl_0_gobble_cache_loop):
+	movdqa	(%eax), %xmm0
+	sub	$128, %ecx
+	movaps	0x10(%eax), %xmm1
+	por	%xmm6, 	%xmm0
+	movaps	0x20(%eax), %xmm2
+	por	%xmm6, 	%xmm1
+	movaps	0x30(%eax), %xmm3
+	por	%xmm6, 	%xmm2
+	movdqa	%xmm0, (%edx)
+	por	%xmm6, 	%xmm3
+	movaps	%xmm1, 0x10(%edx)
+	movaps	%xmm2, 0x20(%edx)
+	movaps	%xmm3, 0x30(%edx)
+	movaps	0x40(%eax), %xmm0
+	lea	0x80(%edx), %edx
+	movaps	0x50(%eax), %xmm1
+	por	%xmm6, 	%xmm0
+	movaps	0x60(%eax), %xmm2
+	por	%xmm6, 	%xmm1
+	movaps	0x70(%eax), %xmm3
+	por	%xmm6, 	%xmm2
+	lea	0x80(%eax), %eax
+	movaps	%xmm0, -0x40(%edx)
+	por	%xmm6, 	%xmm3
+	movaps	%xmm1, -0x30(%edx)
+	movaps	%xmm2, -0x20(%edx)
+	movaps	%xmm3, -0x10(%edx)
+	jae	L(shl_0_gobble_cache_loop)
+	cmp	$-0x40, %ecx
+	lea	0x80(%ecx), %ecx
+	jl	L(shl_0_cache_less_64bytes)
+	movdqa	(%eax), %xmm0
+	sub	$0x40, %ecx
+	movdqa	0x10(%eax), %xmm1
+	por	%xmm6, 	%xmm0
+	movdqa	0x20(%eax), %xmm2
+	por	%xmm6, 	%xmm1
+	movdqa	0x30(%eax), %xmm3
+	por	%xmm6, 	%xmm2
+	movdqa	%xmm0, (%edx)
+	lea	0x40(%eax), %eax
+	movdqa	%xmm1, 0x10(%edx)
+	por	%xmm6, 	%xmm3
+	movdqa	%xmm2, 0x20(%edx)
+	movdqa	%xmm3, 0x30(%edx)
+	lea	0x40(%edx), %edx
+L(shl_0_cache_less_64bytes):
+	cmp	$0x20, %ecx
+	jb	L(shl_0_cache_less_32bytes)
+	movdqa	(%eax), %xmm0
+	sub	$0x20, %ecx
+	movdqa	0x10(%eax), %xmm1
+	por	%xmm6, 	%xmm0
+	lea	0x20(%eax), %eax
+	por	%xmm6, 	%xmm1
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+	lea	0x20(%edx), %edx
+L(shl_0_cache_less_32bytes):
+	cmp	$0x10, %ecx
+	jb	L(shl_0_cache_less_16bytes)
+	movdqa	(%eax), %xmm0
+	sub	$0x10, %ecx
+	add	$0x10, %eax
+	por	%xmm6, 	%xmm0
+	movdqa	%xmm0, (%edx)
+	add	$0x10, %edx
+L(shl_0_cache_less_16bytes):
+	add	%ecx, %edx
+	add	%ecx, %eax
+	add	$4, %ecx
+	and	$60, %ecx
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+	ALIGN (4)
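+/* Source is 1 byte past 16-byte alignment: load aligned 16-byte blocks
+   and stitch the pixels back together with palignr $1.  L(shl_2) through
+   L(shl_15) repeat the same pattern with the matching shift count.  */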
+L(shl_1):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	lea	-32(%ecx), %ecx
+	POP (%esi)
+	POP (%edi)
+	ALIGN (4)
+L(shl_1_loop):
+	movaps	16(%eax), %xmm2
+	sub	$32, %ecx
+	movaps	32(%eax), %xmm3
+	lea	32(%eax), %eax
+	movdqa	%xmm3, %xmm4
+	palignr	$1, %xmm2, %xmm3
+	lea	32(%edx), %edx
+	palignr	$1, %xmm1, %xmm2
+	por	%xmm6, %xmm2
+	movaps	%xmm2, -32(%edx)
+	por	%xmm6, %xmm3
+	movaps	%xmm3, -16(%edx)
+	jb	L(shl_1_end)
+
+	movaps	16(%eax), %xmm2
+	sub	$32, %ecx
+	movaps	32(%eax), %xmm3
+	lea	32(%eax), %eax
+	movdqa	%xmm3, %xmm1
+	palignr	$1, %xmm2, %xmm3
+	lea	32(%edx), %edx
+	palignr	$1, %xmm4, %xmm2
+	por	%xmm6, %xmm2
+	movaps	%xmm2, -32(%edx)
+	por	%xmm6, %xmm3
+	movaps	%xmm3, -16(%edx)
+	jae	L(shl_1_loop)
+L(shl_1_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edx
+	lea	1(%ecx, %eax), %eax
+	add	$4, %ecx
+	and	$60, %ecx
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	ALIGN (4)
+L(shl_2):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	lea	-32(%ecx), %ecx
+	POP (%esi)
+	POP (%edi)
+	ALIGN (4)
+L(shl_2_loop):
+	movaps	16(%eax), %xmm2
+	sub	$32, %ecx
+	movaps	32(%eax), %xmm3
+	lea	32(%eax), %eax
+	movdqa	%xmm3, %xmm4
+	palignr	$2, %xmm2, %xmm3
+	lea	32(%edx), %edx
+	palignr	$2, %xmm1, %xmm2
+	por	%xmm6, %xmm2
+	movaps	%xmm2, -32(%edx)
+	por	%xmm6, %xmm3
+	movaps	%xmm3, -16(%edx)
+	jb	L(shl_2_end)
+
+	movaps	16(%eax), %xmm2
+	sub	$32, %ecx
+	movaps	32(%eax), %xmm3
+	lea	32(%eax), %eax
+	movdqa	%xmm3, %xmm1
+	palignr	$2, %xmm2, %xmm3
+	lea	32(%edx), %edx
+	palignr	$2, %xmm4, %xmm2
+	por	%xmm6, %xmm2
+	movaps	%xmm2, -32(%edx)
+	por	%xmm6, %xmm3
+	movaps	%xmm3, -16(%edx)
+	jae	L(shl_2_loop)
+L(shl_2_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edx
+	lea	2(%ecx, %eax), %eax
+	add	$4, %ecx
+	and	$60, %ecx
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	ALIGN (4)
+L(shl_3):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	lea	-32(%ecx), %ecx
+	POP (%esi)
+	POP (%edi)
+	ALIGN (4)
+L(shl_3_loop):
+	movaps	16(%eax), %xmm2
+	sub	$32, %ecx
+	movaps	32(%eax), %xmm3
+	lea	32(%eax), %eax
+	movdqa	%xmm3, %xmm4
+	palignr	$3, %xmm2, %xmm3
+	lea	32(%edx), %edx
+	palignr	$3, %xmm1, %xmm2
+	por	%xmm6, %xmm2
+	movaps	%xmm2, -32(%edx)
+	por	%xmm6, %xmm3
+	movaps	%xmm3, -16(%edx)
+	jb	L(shl_3_end)
+
+	movaps	16(%eax), %xmm2
+	sub	$32, %ecx
+	movaps	32(%eax), %xmm3
+	lea	32(%eax), %eax
+	movdqa	%xmm3, %xmm1
+	palignr	$3, %xmm2, %xmm3
+	lea	32(%edx), %edx
+	palignr	$3, %xmm4, %xmm2
+	por	%xmm6, %xmm2
+	movaps	%xmm2, -32(%edx)
+	por	%xmm6, %xmm3
+	movaps	%xmm3, -16(%edx)
+	jae	L(shl_3_loop)
+L(shl_3_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edx
+	lea	3(%ecx, %eax), %eax
+	add	$4, %ecx
+	and	$60, %ecx
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	ALIGN (4)
+L(shl_4):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	lea	-32(%ecx), %ecx
+	POP (%esi)
+	POP (%edi)
+	ALIGN (4)
+L(shl_4_loop):
+	movaps	16(%eax), %xmm2
+	sub	$32, %ecx
+	movaps	32(%eax), %xmm3
+	lea	32(%eax), %eax
+	movdqa	%xmm3, %xmm4
+	palignr	$4, %xmm2, %xmm3
+	lea	32(%edx), %edx
+	palignr	$4, %xmm1, %xmm2
+	por	%xmm6, %xmm2
+	movaps	%xmm2, -32(%edx)
+	por	%xmm6, %xmm3
+	movaps	%xmm3, -16(%edx)
+	jb	L(shl_4_end)
+
+	movaps	16(%eax), %xmm2
+	sub	$32, %ecx
+	movaps	32(%eax), %xmm3
+	lea	32(%eax), %eax
+	movdqa	%xmm3, %xmm1
+	palignr	$4, %xmm2, %xmm3
+	lea	32(%edx), %edx
+	palignr	$4, %xmm4, %xmm2
+	por	%xmm6, %xmm2
+	movaps	%xmm2, -32(%edx)
+	por	%xmm6, %xmm3
+	movaps	%xmm3, -16(%edx)
+	jae	L(shl_4_loop)
+L(shl_4_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edx
+	lea	4(%ecx, %eax), %eax
+	add	$4, %ecx
+	and	$60, %ecx
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	ALIGN (4)
+L(shl_5):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	lea	-32(%ecx), %ecx
+	POP (%esi)
+	POP (%edi)
+	ALIGN (4)
+L(shl_5_loop):
+	movaps	16(%eax), %xmm2
+	sub	$32, %ecx
+	movaps	32(%eax), %xmm3
+	lea	32(%eax), %eax
+	movdqa	%xmm3, %xmm4
+	palignr	$5, %xmm2, %xmm3
+	lea	32(%edx), %edx
+	palignr	$5, %xmm1, %xmm2
+	por	%xmm6, %xmm2
+	movaps	%xmm2, -32(%edx)
+	por	%xmm6, %xmm3
+	movaps	%xmm3, -16(%edx)
+	jb	L(shl_5_end)
+
+	movaps	16(%eax), %xmm2
+	sub	$32, %ecx
+	movaps	32(%eax), %xmm3
+	lea	32(%eax), %eax
+	movdqa	%xmm3, %xmm1
+	palignr	$5, %xmm2, %xmm3
+	lea	32(%edx), %edx
+	palignr	$5, %xmm4, %xmm2
+	por	%xmm6, %xmm2
+	movaps	%xmm2, -32(%edx)
+	por	%xmm6, %xmm3
+	movaps	%xmm3, -16(%edx)
+	jae	L(shl_5_loop)
+L(shl_5_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edx
+	lea	5(%ecx, %eax), %eax
+	add	$4, %ecx
+	and	$60, %ecx
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	ALIGN (4)
+L(shl_6):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	lea	-32(%ecx), %ecx
+	POP (%esi)
+	POP (%edi)
+	ALIGN (4)
+L(shl_6_loop):
+	movaps	16(%eax), %xmm2
+	sub	$32, %ecx
+	movaps	32(%eax), %xmm3
+	lea	32(%eax), %eax
+	movdqa	%xmm3, %xmm4
+	palignr	$6, %xmm2, %xmm3
+	lea	32(%edx), %edx
+	palignr	$6, %xmm1, %xmm2
+	por	%xmm6, %xmm2
+	movaps	%xmm2, -32(%edx)
+	por	%xmm6, %xmm3
+	movaps	%xmm3, -16(%edx)
+	jb	L(shl_6_end)
+
+	movaps	16(%eax), %xmm2
+	sub	$32, %ecx
+	movaps	32(%eax), %xmm3
+	lea	32(%eax), %eax
+	movdqa	%xmm3, %xmm1
+	palignr	$6, %xmm2, %xmm3
+	lea	32(%edx), %edx
+	palignr	$6, %xmm4, %xmm2
+	por	%xmm6, %xmm2
+	movaps	%xmm2, -32(%edx)
+	por	%xmm6, %xmm3
+	movaps	%xmm3, -16(%edx)
+	jae	L(shl_6_loop)
+L(shl_6_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edx
+	lea	6(%ecx, %eax), %eax
+	add	$4, %ecx
+	and	$60, %ecx
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	ALIGN (4)
+L(shl_7):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	lea	-32(%ecx), %ecx
+	POP (%esi)
+	POP (%edi)
+	ALIGN (4)
+L(shl_7_loop):
+	movaps	16(%eax), %xmm2
+	sub	$32, %ecx
+	movaps	32(%eax), %xmm3
+	lea	32(%eax), %eax
+	movdqa	%xmm3, %xmm4
+	palignr	$7, %xmm2, %xmm3
+	lea	32(%edx), %edx
+	palignr	$7, %xmm1, %xmm2
+	por	%xmm6, %xmm2
+	movaps	%xmm2, -32(%edx)
+	por	%xmm6, %xmm3
+	movaps	%xmm3, -16(%edx)
+	jb	L(shl_7_end)
+
+	movaps	16(%eax), %xmm2
+	sub	$32, %ecx
+	movaps	32(%eax), %xmm3
+	lea	32(%eax), %eax
+	movdqa	%xmm3, %xmm1
+	palignr	$7, %xmm2, %xmm3
+	lea	32(%edx), %edx
+	palignr	$7, %xmm4, %xmm2
+	por	%xmm6, %xmm2
+	movaps	%xmm2, -32(%edx)
+	por	%xmm6, %xmm3
+	movaps	%xmm3, -16(%edx)
+	jae	L(shl_7_loop)
+L(shl_7_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edx
+	lea	7(%ecx, %eax), %eax
+	add	$4, %ecx
+	and	$60, %ecx
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	ALIGN (4)
+L(shl_8):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	lea	-32(%ecx), %ecx
+	POP (%esi)
+	POP (%edi)
+	ALIGN (4)
+L(shl_8_loop):
+	movaps	16(%eax), %xmm2
+	sub	$32, %ecx
+	movaps	32(%eax), %xmm3
+	lea	32(%eax), %eax
+	movdqa	%xmm3, %xmm4
+	palignr	$8, %xmm2, %xmm3
+	lea	32(%edx), %edx
+	palignr	$8, %xmm1, %xmm2
+	por	%xmm6, %xmm2
+	movaps	%xmm2, -32(%edx)
+	por	%xmm6, %xmm3
+	movaps	%xmm3, -16(%edx)
+	jb	L(shl_8_end)
+
+	movaps	16(%eax), %xmm2
+	sub	$32, %ecx
+	movaps	32(%eax), %xmm3
+	lea	32(%eax), %eax
+	movdqa	%xmm3, %xmm1
+	palignr	$8, %xmm2, %xmm3
+	lea	32(%edx), %edx
+	palignr	$8, %xmm4, %xmm2
+	por	%xmm6, %xmm2
+	movaps	%xmm2, -32(%edx)
+	por	%xmm6, %xmm3
+	movaps	%xmm3, -16(%edx)
+	jae	L(shl_8_loop)
+L(shl_8_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edx
+	lea	8(%ecx, %eax), %eax
+	add	$4, %ecx
+	and	$60, %ecx
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	ALIGN (4)
+L(shl_9):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	lea	-32(%ecx), %ecx
+	POP (%esi)
+	POP (%edi)
+	ALIGN (4)
+L(shl_9_loop):
+	movaps	16(%eax), %xmm2
+	sub	$32, %ecx
+	movaps	32(%eax), %xmm3
+	lea	32(%eax), %eax
+	movdqa	%xmm3, %xmm4
+	palignr	$9, %xmm2, %xmm3
+	lea	32(%edx), %edx
+	palignr	$9, %xmm1, %xmm2
+	por	%xmm6, %xmm2
+	movaps	%xmm2, -32(%edx)
+	por	%xmm6, %xmm3
+	movaps	%xmm3, -16(%edx)
+	jb	L(shl_9_end)
+
+	movaps	16(%eax), %xmm2
+	sub	$32, %ecx
+	movaps	32(%eax), %xmm3
+	lea	32(%eax), %eax
+	movdqa	%xmm3, %xmm1
+	palignr	$9, %xmm2, %xmm3
+	lea	32(%edx), %edx
+	palignr	$9, %xmm4, %xmm2
+	por	%xmm6, %xmm2
+	movaps	%xmm2, -32(%edx)
+	por	%xmm6, %xmm3
+	movaps	%xmm3, -16(%edx)
+	jae	L(shl_9_loop)
+L(shl_9_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edx
+	lea	9(%ecx, %eax), %eax
+	add	$4, %ecx
+	and	$60, %ecx
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	ALIGN (4)
+L(shl_10):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	lea	-32(%ecx), %ecx
+	POP (%esi)
+	POP (%edi)
+	ALIGN (4)
+L(shl_10_loop):
+	movaps	16(%eax), %xmm2
+	sub	$32, %ecx
+	movaps	32(%eax), %xmm3
+	lea	32(%eax), %eax
+	movdqa	%xmm3, %xmm4
+	palignr	$10, %xmm2, %xmm3
+	lea	32(%edx), %edx
+	palignr	$10, %xmm1, %xmm2
+	por	%xmm6, %xmm2
+	movaps	%xmm2, -32(%edx)
+	por	%xmm6, %xmm3
+	movaps	%xmm3, -16(%edx)
+	jb	L(shl_10_end)
+
+	movaps	16(%eax), %xmm2
+	sub	$32, %ecx
+	movaps	32(%eax), %xmm3
+	lea	32(%eax), %eax
+	movdqa	%xmm3, %xmm1
+	palignr	$10, %xmm2, %xmm3
+	lea	32(%edx), %edx
+	palignr	$10, %xmm4, %xmm2
+	por	%xmm6, %xmm2
+	movaps	%xmm2, -32(%edx)
+	por	%xmm6, %xmm3
+	movaps	%xmm3, -16(%edx)
+	jae	L(shl_10_loop)
+L(shl_10_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edx
+	lea	10(%ecx, %eax), %eax
+	add	$4, %ecx
+	and	$60, %ecx
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	ALIGN (4)
+L(shl_11):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	lea	-32(%ecx), %ecx
+	POP (%esi)
+	POP (%edi)
+	ALIGN (4)
+L(shl_11_loop):
+	movaps	16(%eax), %xmm2
+	sub	$32, %ecx
+	movaps	32(%eax), %xmm3
+	lea	32(%eax), %eax
+	movdqa	%xmm3, %xmm4
+	palignr	$11, %xmm2, %xmm3
+	lea	32(%edx), %edx
+	palignr	$11, %xmm1, %xmm2
+	por	%xmm6, %xmm2
+	movaps	%xmm2, -32(%edx)
+	por	%xmm6, %xmm3
+	movaps	%xmm3, -16(%edx)
+	jb	L(shl_11_end)
+
+	movaps	16(%eax), %xmm2
+	sub	$32, %ecx
+	movaps	32(%eax), %xmm3
+	lea	32(%eax), %eax
+	movdqa	%xmm3, %xmm1
+	palignr	$11, %xmm2, %xmm3
+	lea	32(%edx), %edx
+	palignr	$11, %xmm4, %xmm2
+	por	%xmm6, %xmm2
+	movaps	%xmm2, -32(%edx)
+	por	%xmm6, %xmm3
+	movaps	%xmm3, -16(%edx)
+	jae	L(shl_11_loop)
+L(shl_11_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edx
+	lea	11(%ecx, %eax), %eax
+	add	$4, %ecx
+	and	$60, %ecx
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	ALIGN (4)
+L(shl_12):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	lea	-32(%ecx), %ecx
+	POP (%esi)
+	POP (%edi)
+	ALIGN (4)
+L(shl_12_loop):
+	movaps	16(%eax), %xmm2
+	sub	$32, %ecx
+	movaps	32(%eax), %xmm3
+	lea	32(%eax), %eax
+	movdqa	%xmm3, %xmm4
+	palignr	$12, %xmm2, %xmm3
+	lea	32(%edx), %edx
+	palignr	$12, %xmm1, %xmm2
+	por	%xmm6, %xmm2
+	movaps	%xmm2, -32(%edx)
+	por	%xmm6, %xmm3
+	movaps	%xmm3, -16(%edx)
+	jb	L(shl_12_end)
+
+	movaps	16(%eax), %xmm2
+	sub	$32, %ecx
+	movaps	32(%eax), %xmm3
+	lea	32(%eax), %eax
+	movdqa	%xmm3, %xmm1
+	palignr	$12, %xmm2, %xmm3
+	lea	32(%edx), %edx
+	palignr	$12, %xmm4, %xmm2
+	por	%xmm6, %xmm2
+	movaps	%xmm2, -32(%edx)
+	por	%xmm6, %xmm3
+	movaps	%xmm3, -16(%edx)
+	jae	L(shl_12_loop)
+L(shl_12_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edx
+	lea	12(%ecx, %eax), %eax
+	add	$4, %ecx
+	and	$60, %ecx
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	ALIGN (4)
+L(shl_13):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	lea	-32(%ecx), %ecx
+	POP (%esi)
+	POP (%edi)
+	ALIGN (4)
+L(shl_13_loop):
+	movaps	16(%eax), %xmm2
+	sub	$32, %ecx
+	movaps	32(%eax), %xmm3
+	lea	32(%eax), %eax
+	movdqa	%xmm3, %xmm4
+	palignr	$13, %xmm2, %xmm3
+	lea	32(%edx), %edx
+	palignr	$13, %xmm1, %xmm2
+	por	%xmm6, %xmm2
+	movaps	%xmm2, -32(%edx)
+	por	%xmm6, %xmm3
+	movaps	%xmm3, -16(%edx)
+	jb	L(shl_13_end)
+
+	movaps	16(%eax), %xmm2
+	sub	$32, %ecx
+	movaps	32(%eax), %xmm3
+	lea	32(%eax), %eax
+	movdqa	%xmm3, %xmm1
+	palignr	$13, %xmm2, %xmm3
+	lea	32(%edx), %edx
+	palignr	$13, %xmm4, %xmm2
+	por	%xmm6, %xmm2
+	movaps	%xmm2, -32(%edx)
+	por	%xmm6, %xmm3
+	movaps	%xmm3, -16(%edx)
+	jae	L(shl_13_loop)
+L(shl_13_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edx
+	lea	13(%ecx, %eax), %eax
+	add	$4, %ecx
+	and	$60, %ecx
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	ALIGN (4)
+L(shl_14):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	lea	-32(%ecx), %ecx
+	POP (%esi)
+	POP (%edi)
+	ALIGN (4)
+L(shl_14_loop):
+	movaps	16(%eax), %xmm2
+	sub	$32, %ecx
+	movaps	32(%eax), %xmm3
+	lea	32(%eax), %eax
+	movdqa	%xmm3, %xmm4
+	palignr	$14, %xmm2, %xmm3
+	lea	32(%edx), %edx
+	palignr	$14, %xmm1, %xmm2
+	por	%xmm6, %xmm2
+	movaps	%xmm2, -32(%edx)
+	por	%xmm6, %xmm3
+	movaps	%xmm3, -16(%edx)
+	jb	L(shl_14_end)
+
+	movaps	16(%eax), %xmm2
+	sub	$32, %ecx
+	movaps	32(%eax), %xmm3
+	lea	32(%eax), %eax
+	movdqa	%xmm3, %xmm1
+	palignr	$14, %xmm2, %xmm3
+	lea	32(%edx), %edx
+	palignr	$14, %xmm4, %xmm2
+	por	%xmm6, %xmm2
+	movaps	%xmm2, -32(%edx)
+	por	%xmm6, %xmm3
+	movaps	%xmm3, -16(%edx)
+	jae	L(shl_14_loop)
+L(shl_14_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edx
+	lea	14(%ecx, %eax), %eax
+	add	$4, %ecx
+	and	$60, %ecx
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	ALIGN (4)
+L(shl_15):
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+	lea	-32(%ecx), %ecx
+	POP (%esi)
+	POP (%edi)
+	ALIGN (4)
+L(shl_15_loop):
+	movaps	16(%eax), %xmm2
+	sub	$32, %ecx
+	movaps	32(%eax), %xmm3
+	lea	32(%eax), %eax
+	movdqa	%xmm3, %xmm4
+	palignr	$15, %xmm2, %xmm3
+	lea	32(%edx), %edx
+	palignr	$15, %xmm1, %xmm2
+	por	%xmm6, %xmm2
+	movaps	%xmm2, -32(%edx)
+	por	%xmm6, %xmm3
+	movaps	%xmm3, -16(%edx)
+	jb	L(shl_15_end)
+
+	movaps	16(%eax), %xmm2
+	sub	$32, %ecx
+	movaps	32(%eax), %xmm3
+	lea	32(%eax), %eax
+	movdqa	%xmm3, %xmm1
+	palignr	$15, %xmm2, %xmm3
+	lea	32(%edx), %edx
+	palignr	$15, %xmm4, %xmm2
+	por	%xmm6, %xmm2
+	movaps	%xmm2, -32(%edx)
+	por	%xmm6, %xmm3
+	movaps	%xmm3, -16(%edx)
+	jae	L(shl_15_loop)
+L(shl_15_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edx
+	lea	15(%ecx, %eax), %eax
+	add	$4, %ecx
+	and	$60, %ecx
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+	ALIGN (4)
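+/* Forward tails: write the last 0-44 bytes one dword at a time, OR-ing
+   in the 0xff000000 alpha.  Entered through L(table_48bytes_fwd).  */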
+L(fwd_write_44bytes):
+	movl	$0xff000000, %ecx
+	or	-44(%eax), %ecx
+	movl	%ecx, -44(%edx)
+L(fwd_write_40bytes):
+	movl	$0xff000000, %ecx
+	or	-40(%eax), %ecx
+	movl	%ecx, -40(%edx)
+L(fwd_write_36bytes):
+	movl	$0xff000000, %ecx
+	or	-36(%eax), %ecx
+	movl	%ecx, -36(%edx)
+L(fwd_write_32bytes):
+	movl	$0xff000000, %ecx
+	or	-32(%eax), %ecx
+	movl	%ecx, -32(%edx)
+L(fwd_write_28bytes):
+	movl	$0xff000000, %ecx
+	or	-28(%eax), %ecx
+	movl	%ecx, -28(%edx)
+L(fwd_write_24bytes):
+	movl	$0xff000000, %ecx
+	or	-24(%eax), %ecx
+	movl	%ecx, -24(%edx)
+L(fwd_write_20bytes):
+	movl	$0xff000000, %ecx
+	or	-20(%eax), %ecx
+	movl	%ecx, -20(%edx)
+L(fwd_write_16bytes):
+	movl	$0xff000000, %ecx
+	or	-16(%eax), %ecx
+	movl	%ecx, -16(%edx)
+L(fwd_write_12bytes):
+	movl	$0xff000000, %ecx
+	or	-12(%eax), %ecx
+	movl	%ecx, -12(%edx)
+L(fwd_write_8bytes):
+	movl	$0xff000000, %ecx
+	or	-8(%eax), %ecx
+	movl	%ecx, -8(%edx)
+L(fwd_write_4bytes):
+	movl	$0xff000000, %ecx
+	or	-4(%eax), %ecx
+	movl	%ecx, -4(%edx)
+L(fwd_write_0bytes):
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+
+	RETURN
+
+	ALIGN (4)
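+/* Backward tails for the short-copy path that did not advance the
+   pointers; entered through L(table_48_bytes_bwd).  */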
+L(bk_write_44bytes):
+	movl	$0xff000000, %ecx
+	or	40(%eax), %ecx
+	movl	%ecx, 40(%edx)
+L(bk_write_40bytes):
+	movl	$0xff000000, %ecx
+	or	36(%eax), %ecx
+	movl	%ecx, 36(%edx)
+L(bk_write_36bytes):
+	movl	$0xff000000, %ecx
+	or	32(%eax), %ecx
+	movl	%ecx, 32(%edx)
+L(bk_write_32bytes):
+	movl	$0xff000000, %ecx
+	or	28(%eax), %ecx
+	movl	%ecx, 28(%edx)
+L(bk_write_28bytes):
+	movl	$0xff000000, %ecx
+	or	24(%eax), %ecx
+	movl	%ecx, 24(%edx)
+L(bk_write_24bytes):
+	movl	$0xff000000, %ecx
+	or	20(%eax), %ecx
+	movl	%ecx, 20(%edx)
+L(bk_write_20bytes):
+	movl	$0xff000000, %ecx
+	or	16(%eax), %ecx
+	movl	%ecx, 16(%edx)
+L(bk_write_16bytes):
+	movl	$0xff000000, %ecx
+	or	12(%eax), %ecx
+	movl	%ecx, 12(%edx)
+L(bk_write_12bytes):
+	movl	$0xff000000, %ecx
+	or	8(%eax), %ecx
+	movl	%ecx, 8(%edx)
+L(bk_write_8bytes):
+	movl	$0xff000000, %ecx
+	or	4(%eax), %ecx
+	movl	%ecx, 4(%edx)
+L(bk_write_4bytes):
+	movl	$0xff000000, %ecx
+	or	(%eax), %ecx
+	movl	%ecx, (%edx)
+L(bk_write_0bytes):
+#ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+# endif
+#endif
+	RETURN
+
+
+	.pushsection .rodata.ssse3,"a",@progbits
+	ALIGN (2)
+L(table_48bytes_fwd):
+	.int	JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
+
+	ALIGN (2)
+L(shl_table):
+	.int	JMPTBL (L(shl_0), L(shl_table))
+	.int	JMPTBL (L(shl_1), L(shl_table))
+	.int	JMPTBL (L(shl_2), L(shl_table))
+	.int	JMPTBL (L(shl_3), L(shl_table))
+	.int	JMPTBL (L(shl_4), L(shl_table))
+	.int	JMPTBL (L(shl_5), L(shl_table))
+	.int	JMPTBL (L(shl_6), L(shl_table))
+	.int	JMPTBL (L(shl_7), L(shl_table))
+	.int	JMPTBL (L(shl_8), L(shl_table))
+	.int	JMPTBL (L(shl_9), L(shl_table))
+	.int	JMPTBL (L(shl_10), L(shl_table))
+	.int	JMPTBL (L(shl_11), L(shl_table))
+	.int	JMPTBL (L(shl_12), L(shl_table))
+	.int	JMPTBL (L(shl_13), L(shl_table))
+	.int	JMPTBL (L(shl_14), L(shl_table))
+	.int	JMPTBL (L(shl_15), L(shl_table))
+
+	ALIGN (2)
+L(table_48_bytes_bwd):
+	.int	JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
+
+	.popsection
+
+END (MEMCPY_OR)
diff --git a/pixman/pixman-access.c b/pixman/pixman-access.c
index 56de711..db085b6 100644
--- a/pixman/pixman-access.c
+++ b/pixman/pixman-access.c
@@ -113,10 +113,17 @@ fetch_scanline_x8r8g8b8 (pixman_image_t *image,
 {
     const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
     const uint32_t *pixel = (const uint32_t *)bits + x;
+
+#if defined(USE_SSSE3) && !defined(PIXMAN_FB_ACCESSORS)
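+    /* The accessor variant must go through READ(), so it cannot take the direct path.  */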
+    extern void *fetch_scanline_x8r8g8b8_ssse3_fast_path(void *dest, const void *src, int count);
+    fetch_scanline_x8r8g8b8_ssse3_fast_path(buffer, pixel, width * 4);
+#else
     const uint32_t *end = pixel + width;
     
     while (pixel < end)
 	*buffer++ = READ (image, pixel++) | 0xff000000;
+#endif
 }
 
 static void
-- 
1.7.0.4