[PATCH] [ssse3] Optimization for fetch_scanline_x8r8g8b8

Wed Dec 8 06:19:50 PST 2010

Add x8888 ssse3 optimization.

Signed-off-by: Xu Samuel <samuel.xu at intel.com>
Signed-off-by: Ma Ling <ling.ma at intel.com>
Signed-off-by: Zhao Yakui <yakui.zhao at intel.com>
---
 pixman/Makefile.am            |    4 +-
 pixman/pixman-ssse3-x86-asm.S |  255 +++++++++++++++++++
 pixman/pixman-ssse3-x86-asm.h |  552 +++++++++++++++++++++++++++++++++++++++++
 pixman/pixman-ssse3.c         |   47 ++++
 4 files changed, 857 insertions(+), 1 deletions(-)
 create mode 100755 pixman/pixman-ssse3-x86-asm.S
 create mode 100755 pixman/pixman-ssse3-x86-asm.h

diff --git a/pixman/Makefile.am b/pixman/Makefile.am
index ba6810c..c6a731c 100644
--- a/pixman/Makefile.am
+++ b/pixman/Makefile.am
@@ -98,7 +98,8 @@ endif
 if USE_SSSE3
 noinst_LTLIBRARIES += libpixman-ssse3.la
 libpixman_ssse3_la_SOURCES = \
-	pixman-ssse3.c
+	pixman-ssse3.c \
+	pixman-ssse3-x86-asm.S
 libpixman_ssse3_la_CFLAGS = $(DEP_CFLAGS) $(SSSE3_CFLAGS)
 libpixman_ssse3_la_LIBADD = $(DEP_LIBS)
 libpixman_1_la_LDFLAGS += $(SSSE3_LDFLAGS)
@@ -106,6 +107,7 @@ libpixman_1_la_LIBADD += libpixman-ssse3.la
 
 ASM_CFLAGS_ssse3=$(SSSE3_CFLAGS)
 endif
+
 # arm simd code
 if USE_ARM_SIMD
 noinst_LTLIBRARIES += libpixman-arm-simd.la
diff --git a/pixman/pixman-ssse3-x86-asm.S b/pixman/pixman-ssse3-x86-asm.S
new file mode 100755
index 0000000..c9b187e
--- /dev/null
+++ b/pixman/pixman-ssse3-x86-asm.S
@@ -0,0 +1,255 @@
+/*
+ * Copyright 2010 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author:  Ma Ling (ling.ma at intel.com)
+ * Author:  Xu, Samuel (samuel.xu at intel.com)
+ * Author:  Yakui, Zhao (yakui.zhao at intel.com)
+ */
+#include "pixman-ssse3-x86-asm.h"
+
+
+    .section .note.GNU-stack
+    .previous
+
+#if (!defined(__amd64__) && !defined(__x86_64__))
+
+	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+	.globl	__i686.get_pc_thunk.bx
+	.hidden	__i686.get_pc_thunk.bx
+	ALIGN (4)
+	.type	__i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+	movl	(%esp), %ebx
+	ret
+#endif
+	.section .text.ssse3,"ax",@progbits
+
+ENTRY(composite_line_src_x888_8888_ssse3)
+	/* Function prologue. On 32-bit it saves the EBX register and loads
+	 * the arguments from the stack; 64-bit needs no stack setup.
+	 */
+	ENTRANCE;
+	/* check whether the copy count is >= 48.
+	 * if the copy count is >=48, goto 48bytesormore and use
+	 * the XMM register to copy data. Otherwise the general
+	 * purpose register is used.
+	 */
+	CMP_COPY_LENGTH $48;
+	jae L(48bytesormore);
+	/*
+	 * When the copy length is less than 48, we will use the general-purpose
+	 * register to copy the pixel data.
+	 */
+	GOTO_FWD_COPY;
+
+	ALIGN (4)
+/* Reached only when the copy length is >= 48 bytes.  */
+L(48bytesormore):
+	/* Check whether the source address is aligned with the destination
+	 * address at 16-bytes boundary. If it is aligned, we will try to
+	 * use 128-bytes loop copy mode. If they are not aligned, the
+	 * packed-align copy mode will be used for the different unaligned
+	 * length. For example: 4/8/12
+	 * It is noted that the following src/dest will also be regarded as
+	 * aligned after we handle the first 16-bytes.
+	 * SRC: 0x48000005 , DEST: 0x49000025
+	 */
+	SHL_COPY_PREPROCESS;
+	/* If it is aligned, use the 128-bytes loop copy mode */
+	jz L(shl_0);
+	/* calculate the different unaligned length and then use the
+	 * unaligned copy mode. The possible unaligned length
+	 * is 4/8/12
+	 */
+	GOTO_UNALIGNED_COPY;
+
+	ALIGN (4)
+L(shl_0):
+	/* Compare whether the length is above 127bytes. If not,
+	 * try to use 32-bytes copy mode several times and then handle the
+	 * left length. For example: if the length is 100, it will use 32-bytes
+	 * copy three times and then handle the left 4 bytes by using
+	 * general-purpose register to do forward-copy mode.
+	 */
+	SHL0_COPY_PREPROCESS;
+	CMP_COPY_LENGTH $127
+	/* If the length is >= 128bytes, we will use 128-byte loop-copy mode */
+	ja L(shl_0_gobble);
+	DEC_COPY_LENGTH 32;
+
+	/* Copy 32-bytes */
+	SHL0_COPY_32BYTES;
+	jb	L(shl_0_end)
+
+	SHL0_COPY_32BYTES;
+	jb	L(shl_0_end)
+
+	SHL0_COPY_32BYTES;
+	jb	L(shl_0_end)
+
+	SHL0_COPY_32BYTES;
+L(shl_0_end):
+	/* handle the left length, which can't be handled by 32-byte
+	 * copy mode.
+	 */
+	SHL0_COPY_POSTPROCESS;
+
+L(shl_0_gobble):
+	SHL0_GOBBLE_PREPROCESS;
+	DEC_COPY_LENGTH 128
+	/* Use the 128-byte loop-copy mode. The si/di/cx will be updated. Every
+	 * loop will try to copy 128bytes by using XMM register */
+L(shl_0_gobble_cache_loop):
+	SHL0_COPY_128BYTES;
+
+	jae	L(shl_0_gobble_cache_loop)
+
+	/* If the left length is < 128, it will compare the length with 64/32/16
+	 * then handle the corresponding length data */
+	CMP_COPY_LENGTH $-0x40;
+	INC_COPY_LENGTH 0x80;
+	jl L(shl_0_cache_less_64bytes);
+	/* Copy 64bytes when length is >=64 */
+	HANDLE_64BYTES;
+L(shl_0_cache_less_64bytes):
+	CMP_COPY_LENGTH $32;
+	jb L(shl_0_cache_less_32bytes);
+	/* Copy 32bytes when length is >=32 */
+	HANDLE_32BYTES;
+L(shl_0_cache_less_32bytes):
+	CMP_COPY_LENGTH $16;
+	/* Copy 16bytes when length is >=16 */
+	jb L(shl_0_cache_less_16bytes);
+	HANDLE_16BYTES;
+
+L(shl_0_cache_less_16bytes):
+	/* Use the general-purpose register to copy when the left length
+	 * is < 16
+	 */
+	HANDLE_LESS_16BYTES;
+
+	ALIGN(4)
+	/* The unaligned length is 4 */
+L(shl_4):
+	SHL_PREPROCESS;
+	ALIGN(4)
+	/* Use the two-stage 32bytes copy. And the 4-unaligned length
+	 * is considered in course of copy. It is noted that two
+	 * 16-bytes XMM register will be packed into one 16-byte
+	 * XMM register.
+	 */
+L(shl_4_loop):
+	SHL_COPY_STAGE_ONE $4
+	jb	L(shl_4_end)
+
+	SHL_COPY_STAGE_TWO $4
+	jae	L(shl_4_loop)
+L(shl_4_end):
+	/* Copy the left length */
+	SHL_POSTPROCESS(4);
+
+	ALIGN (4)
+	/* The unaligned length is 8 */
+L(shl_8):
+	SHL_PREPROCESS;
+	ALIGN(4)
+L(shl_8_loop):
+	SHL_COPY_STAGE_ONE $8
+	jb	L(shl_8_end)
+
+	SHL_COPY_STAGE_TWO $8
+	jae	L(shl_8_loop)
+L(shl_8_end):
+	SHL_POSTPROCESS(8);
+
+	/* The unaligned length is 12 */
+	ALIGN(4)
+L(shl_12):
+	SHL_PREPROCESS;
+	ALIGN(4)
+
+L(shl_12_loop):
+	SHL_COPY_STAGE_ONE $12
+	jb	L(shl_12_end)
+
+	SHL_COPY_STAGE_TWO $12
+
+	jae	L(shl_12_loop)
+L(shl_12_end):
+	SHL_POSTPROCESS(12);
+
+/* Forward copy for the length < 48 */
+	ALIGN (4)
+L(fwd_write_44bytes):
+	fwd_write_bytes 44
+L(fwd_write_40bytes):
+	fwd_write_bytes 40
+L(fwd_write_36bytes):
+	fwd_write_bytes 36
+L(fwd_write_32bytes):
+	fwd_write_bytes 32
+L(fwd_write_28bytes):
+	fwd_write_bytes 28
+L(fwd_write_24bytes):
+	fwd_write_bytes 24
+L(fwd_write_20bytes):
+	fwd_write_bytes 20
+L(fwd_write_16bytes):
+	fwd_write_bytes 16
+L(fwd_write_12bytes):
+	fwd_write_bytes 12
+L(fwd_write_8bytes):
+	fwd_write_bytes 8
+L(fwd_write_4bytes):
+	fwd_write_bytes 4
+L(fwd_write_0bytes):
+	FWD_WRITE_0BYTES;
+
+
+/* the Jump table for the forward copy with different length.*/
+	.pushsection .rodata.ssse3,"a",@progbits
+	ALIGN (2)
+L(table_48bytes_fwd):
+	.int	JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
+
+/* Jump table for the unaligned copy paths, indexed by (src & 15) / 4. */
+	ALIGN (2)
+L(shl_table):
+	.int	JMPTBL (L(shl_0), L(shl_table))
+	.int	JMPTBL (L(shl_4), L(shl_table))
+	.int	JMPTBL (L(shl_8), L(shl_table))
+	.int	JMPTBL (L(shl_12), L(shl_table))
+
+	.popsection
+
+END(composite_line_src_x888_8888_ssse3)
diff --git a/pixman/pixman-ssse3-x86-asm.h b/pixman/pixman-ssse3-x86-asm.h
new file mode 100755
index 0000000..d4a50ef
--- /dev/null
+++ b/pixman/pixman-ssse3-x86-asm.h
@@ -0,0 +1,552 @@
+/*
+ * Copyright 2010 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author:  Ma Ling (ling.ma at intel.com)
+ * Author:  Xu, Samuel (samuel.xu at intel.com)
+ * Author:  Yakui, Zhao (yakui.zhao at intel.com)
+ *
+ */
+
+#ifndef L
+#define L(label)			.L##label
+#endif
+
+/* the align macro, which will be aligned to 2^n bytes boundary */
+#ifndef ALIGN
+# define ALIGN(n)			.p2align n
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc			.cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc			.cfi_endproc
+#endif
+
+#ifndef cfi_rel_offset
+# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
+#endif
+
+#ifndef cfi_restore
+# define cfi_restore(reg)		.cfi_restore reg
+#endif
+
+#ifndef cfi_adjust_cfa_offset
+# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
+#endif
+
+#ifndef cfi_remember_state
+# define cfi_remember_state		.cfi_remember_state
+#endif
+
+#ifndef cfi_restore_state
+# define cfi_restore_state		.cfi_restore_state
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name)			\
+	.type name,  @function; 	\
+	.globl name;			\
+	.p2align 4;			\
+name:					\
+	cfi_startproc
+#endif
+
+#ifndef END
+# define END(name)			\
+	cfi_endproc;			\
+	.size name, .-name
+#endif
+
+#define JMPTBL(I, B)	I - B
+
+/* the macro definition of 128-bytes copy. It is only for the aligned case.
+ * The si/di/cx register will also be updated after copy
+ * It is noted that the si/di should be aligned at a 16-byte boundary when
+ * the movdqa/movaps instruction is used.
+ */
+.macro ALIGN_COPY_128BYTES si, di, cx
+	movdqa	(\si), %xmm0
+	sub	$128, \cx
+	movaps	0x10(\si), %xmm1
+	por	%xmm6, 	%xmm0
+	movaps	0x20(\si), %xmm2
+	por	%xmm6, 	%xmm1
+	movaps	0x30(\si), %xmm3
+	por	%xmm6, 	%xmm2
+	movdqa	%xmm0, (\di)
+	por	%xmm6, 	%xmm3
+	movaps	%xmm1, 0x10(\di)
+	movaps	%xmm2, 0x20(\di)
+	movaps	%xmm3, 0x30(\di)
+	movaps	0x40(\si), %xmm0
+	lea	0x80(\di), \di
+	movaps	0x50(\si), %xmm1
+	por	%xmm6, 	%xmm0
+	movaps	0x60(\si), %xmm2
+	por	%xmm6, 	%xmm1
+	movaps	0x70(\si), %xmm3
+	por	%xmm6, 	%xmm2
+	lea	0x80(\si), \si
+	movaps	%xmm0, -0x40(\di)
+	por	%xmm6, 	%xmm3
+	movaps	%xmm1, -0x30(\di)
+	movaps	%xmm2, -0x20(\di)
+	movaps	%xmm3, -0x10(\di)
+.endm
+
+/* the macro definition of 64-bytes copy. It is only for the aligned case.
+ * The si/di/cx register will also be updated after copy
+ */
+.macro ALIGN_COPY_64BYTES si, di, cx
+	movdqa	(\si), %xmm0
+	sub	$0x40, \cx
+	movdqa	0x10(\si), %xmm1
+	por	%xmm6, 	%xmm0
+	movdqa	0x20(\si), %xmm2
+	por	%xmm6, 	%xmm1
+	movdqa	0x30(\si), %xmm3
+	por	%xmm6, 	%xmm2
+	movdqa	%xmm0, (\di)
+	lea	0x40(\si), \si
+	movdqa	%xmm1, 0x10(\di)
+	por	%xmm6, 	%xmm3
+	movdqa	%xmm2, 0x20(\di)
+	movdqa	%xmm3, 0x30(\di)
+	lea	0x40(\di), \di
+.endm
+
+/* the macro definition of 32-bytes copy. It is only for the aligned case.
+ * The si/di/cx register will also be updated after copy
+ */
+.macro ALIGN_COPY_32BYTES si, di, cx
+	movdqa	(\si), %xmm0
+	sub	$0x20, \cx
+	movdqa	0x10(\si), %xmm1
+	por	%xmm6, 	%xmm0
+	lea	0x20(\si), \si
+	por	%xmm6, 	%xmm1
+	movdqa	%xmm0, (\di)
+	movdqa	%xmm1, 0x10(\di)
+	lea	0x20(\di), \di
+.endm
+
+/* the macro definition of 16-bytes copy. It is only for the aligned case.
+ * The si/di/cx register will also be updated after copy
+ */
+.macro ALIGN_COPY_16BYTES si, di, cx
+	movdqa	(\si), %xmm0
+	sub	$0x10, \cx
+	add	$0x10, \si
+	por	%xmm6, 	%xmm0
+	movdqa	%xmm0, (\di)
+	add	$0x10, \di
+.endm
+
+#if (defined(__amd64__) || defined(__x86_64__))
+
+#define ENTRANCE	movl	%edx, %edx	/* zero-extend the 32-bit count; the ABI leaves the upper half of %rdx unspecified */
+
+/* Branch to an entry in a jump table.  TABLE holds offsets relative to
+   its own start.  INDEX is a register holding a byte count; it is divided
+   by 4 to select the entry and is clobbered.  SCALE is the entry size.  */
+#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+	shr		$2, INDEX;				\
+	lea		TABLE(%rip), %r11;			\
+	movslq	(%r11, INDEX, SCALE), INDEX;			\
+	lea		(%r11, INDEX), INDEX;			\
+	jmp		*INDEX;					\
+	ud2
+
+/* the macro definition of forward copy for the remaining small bytes.*/
+.macro fwd_write_bytes x
+	movl	$0xff000000, %ecx
+	or	-\x(%rsi), %ecx
+	movl	%ecx, -\x(%rdi)
+.endm
+
+/* the macro definition of shift copy in stage one . 32bytes.
+ * Two 16-bytes XMM register will be packed into another 16-byte by using palignr
+ * instruction. */
+.macro SHL_COPY_STAGE_ONE x
+	movaps	16(%rsi), %xmm2
+	sub	$32, %rdx
+	movaps	32(%rsi), %xmm3
+	lea	32(%rsi), %rsi
+	movdqa	%xmm3, %xmm4
+	palignr	\x, %xmm2, %xmm3
+	lea	32(%rdi), %rdi
+	palignr	\x, %xmm1, %xmm2
+	por	%xmm6, %xmm2
+	movaps	%xmm2, -32(%rdi)
+	por	%xmm6, %xmm3
+	movaps	%xmm3, -16(%rdi)
+.endm
+
+/* the macro definition of shift copy in stage two . 32bytes.
+ * Two 16-bytes XMM register will be packed into another 16-byte by using palignr
+ * instruction. */
+.macro SHL_COPY_STAGE_TWO x
+	movaps	16(%rsi), %xmm2
+	sub	$32, %rdx
+	movaps	32(%rsi), %xmm3
+	lea	32(%rsi), %rsi
+	movdqa	%xmm3, %xmm1
+	palignr	\x, %xmm2, %xmm3
+	lea	32(%rdi), %rdi
+	palignr	\x, %xmm4, %xmm2
+	por	%xmm6, %xmm2
+	movaps	%xmm2, -32(%rdi)
+	por	%xmm6, %xmm3
+	movaps	%xmm3, -16(%rdi)
+.endm
+
+/* the macro definition of INC/DEC copy-length register */
+.macro DEC_COPY_LENGTH x
+	lea	-\x(%rdx), %rdx;
+.endm
+
+.macro INC_COPY_LENGTH x
+	lea	\x(%rdx), %rdx;
+.endm
+
+/* Compare the length with the given length */
+.macro CMP_COPY_LENGTH x
+	cmp	\x, %rdx
+.endm
+
+/* The definition of forward-copy mode.
+ * It will jump to the corresponding function based on the copy-length
+ */
+
+
+#define GOTO_FWD_COPY			\
+	add	%rdx, %rsi;		\
+	add	%rdx, %rdi;		\
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %rdx, 4)
+
+/* Round up the dest to 16-aligned address boundary and then
+ * calculate whether the src is aligned with the dest
+ */
+#define SHL_COPY_PREPROCESS		\
+	movdqu	(%rsi), %xmm0;		\
+	mov	$0xff000000, %ecx;	\
+	movd	%ecx, %xmm6;		\
+	mov	%rdi, %r8;		\
+	and	$-16, %rdi;		\
+	add	$16, %rdi;		\
+	mov	%r8, %rcx;		\
+	sub	%rdi, %rcx;		\
+	add	%rcx, %rdx;		\
+	sub	%rcx, %rsi;		\
+					\
+	mov	%rsi, %rcx;		\
+	pshufd	$0, %xmm6, %xmm6;	\
+	and	$0xf, %rcx;		\
+	por	%xmm6, %xmm0;
+
+/* Calculate the unaligned-length and use the unaligned-copy mode.
+ * The unaligned-length should be considered when packing two 16-bytes into
+ * another 16-bytes by using palignr instruction */
+#define GOTO_UNALIGNED_COPY		\
+	sub	%rcx, %rsi;		\
+	lea	L(shl_table)(%rip), %r11;	\
+	shr	$2, %rcx;		\
+	movaps	(%rsi), %xmm1;		\
+	movdqu	%xmm0, (%r8);		\
+	movslq	(%r11, %rcx, 4), %rcx;	\
+	lea	(%r11, %rcx), %rcx;	\
+	jmp	*%rcx;			\
+	ud2;				\
+
+/* Preparation for the aligned copy: store the first 16 bytes at the
+ * original destination, which may not be on a 16-byte boundary.
+ */
+#define SHL0_COPY_PREPROCESS		\
+	movdqu	%xmm0, (%r8);		\
+	xor	%ecx, %ecx;
+
+/* the following two macro definitions are used to copy the
+ * data when the length is less than 128. Every time it will
+ * try to copy 32-bytes.
+ */
+#define SHL0_COPY_32BYTES			\
+	movdqa	(%rsi, %rcx), %xmm0;		\
+	sub	$32, %rdx;			\
+	movdqa	16(%rsi, %rcx), %xmm1;		\
+	por	%xmm6, 	%xmm0;			\
+	por	%xmm6, 	%xmm1;			\
+	movdqa	%xmm0, (%rdi, %rcx);		\
+	movdqa	%xmm1, 16(%rdi, %rcx);		\
+	lea	32(%rcx), %rcx;
+
+/* when the left length is less than 32, Use the forward-copy */
+#define SHL0_COPY_POSTPROCESS			\
+	lea	32(%rdx), %rdx;			\
+	add	%rdx, %rcx;			\
+	add	%rcx, %rsi;			\
+	add	%rcx, %rdi;			\
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %rdx, 4)
+
+
+#define SHL0_COPY_128BYTES			\
+	ALIGN_COPY_128BYTES %rsi, %rdi, %rdx
+
+#define SHL0_GOBBLE_PREPROCESS
+
+#define HANDLE_64BYTES			\
+	ALIGN_COPY_64BYTES %rsi, %rdi, %rdx
+
+#define HANDLE_32BYTES			\
+	ALIGN_COPY_32BYTES %rsi, %rdi, %rdx
+
+#define HANDLE_16BYTES			\
+	ALIGN_COPY_16BYTES %rsi, %rdi, %rdx
+
+/* When the left length is < 16 */
+#define HANDLE_LESS_16BYTES			\
+	add	%rdx, %rsi;			\
+	add	%rdx, %rdi;			\
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %rdx, 4)
+
+/* the definition of PREPROCESS for unaligned case. 4, 8, 12 */
+#define SHL_PREPROCESS			\
+	lea	-32(%rdx), %rdx;	\
+
+/* the definition of POSTPROCESS for unaligned case. 4, 8, 12 */
+#define SHL_POSTPROCESS(x)			\
+	lea	32(%rdx), %rdx;			\
+	lea	x(%rsi, %rdx), %rsi;		\
+	add	%rdx, %rdi;			\
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %rdx, 4)
+
+/* This is used when the copy is already finished */
+#define FWD_WRITE_0BYTES			\
+	ret
+
+#else
+/* the following is the macro definition on 32-bits */
+# define PARMS		8		/* Preserve EBX.  */
+# define DEST		PARMS
+# define SRC		DEST+4
+# define LEN		SRC+4
+
+#define CFI_PUSH(REG)						\
+  cfi_adjust_cfa_offset (4);					\
+  cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)						\
+  cfi_adjust_cfa_offset (-4);					\
+  cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+
+# define RETURN_END	POP (%ebx); ret
+# define RETURN		RETURN_END; CFI_PUSH (%ebx)
+
+#define ENTRANCE			\
+	PUSH (%ebx);			\
+	movl	LEN(%esp), %ecx;	\
+	movl	SRC(%esp), %eax;	\
+	movl	DEST(%esp), %edx;
+
+/* Load an entry in a jump table into EBX and branch to it.  TABLE holds
+   offsets relative to its own start.  INDEX is a register holding a byte
+   count; it is divided by 4 to select the entry.  SCALE is the entry size. */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+    /* We first load PC into EBX.  */				\
+    call	__i686.get_pc_thunk.bx;				\
+    /* Get the address of the jump table.  */			\
+    addl	$(TABLE - .), %ebx;				\
+    shr		$2, INDEX;				\
+    /* Get the entry and convert the relative offset to the	\
+       absolute address.  */					\
+    addl	(%ebx,INDEX,SCALE), %ebx;			\
+    /* We loaded the jump table.  Go.  */			\
+    jmp		*%ebx
+
+# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)			\
+    addl	$(TABLE - .), %ebx
+
+# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE)	\
+    shr		$2, INDEX;					\
+    addl	(%ebx,INDEX,SCALE), %ebx;			\
+    /* We loaded the jump table.  Go.  */			\
+    jmp		*%ebx
+
+.macro DEC_COPY_LENGTH x
+	lea	-\x(%ecx), %ecx;
+.endm
+
+.macro INC_COPY_LENGTH x
+	lea 	\x(%ecx), %ecx
+.endm
+
+.macro CMP_COPY_LENGTH x
+	cmp	\x, %ecx
+.endm
+
+
+#define GOTO_FWD_COPY			\
+	add	%ecx, %edx;		\
+	add	%ecx, %eax;		\
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+
+
+#define SHL_COPY_PREPROCESS		\
+	movdqu	(%eax), %xmm0;		\
+	PUSH (%edi);			\
+	mov	$0xff000000, %edi;	\
+	movd	%edi, %xmm6;		\
+	movl	%edx, %edi;		\
+	and	$-16, %edx;		\
+	PUSH (%esi);			\
+	add	$16, %edx;		\
+	movl	%edi, %esi;		\
+	sub	%edx, %edi;		\
+	add	%edi, %ecx;		\
+	sub	%edi, %eax;		\
+					\
+	mov	%eax, %edi;		\
+	pshufd	$0, %xmm6, %xmm6;	\
+	and	$0xf, %edi;		\
+	por	%xmm6, %xmm0;		\
+
+#define GOTO_UNALIGNED_COPY		\
+	sub	%edi, %eax;		\
+	call	__i686.get_pc_thunk.bx;	\
+	addl	$(L(shl_table)- .), %ebx;	\
+	shr	$2, %edi;		\
+	movaps	(%eax), %xmm1;		\
+	addl	(%ebx,%edi,4), %ebx;	\
+	movdqu	%xmm0, (%esi);		\
+	jmp	*%ebx;
+
+#define SHL0_COPY_PREPROCESS		\
+	movdqu	%xmm0, (%esi);		\
+	xor	%edi, %edi;		\
+	POP (%esi);
+
+#define SHL0_COPY_32BYTES		\
+	movdqa	(%eax, %edi), %xmm0;	\
+	sub	$32, %ecx;		\
+	movdqa	16(%eax, %edi), %xmm1;	\
+	por	%xmm6, 	%xmm0;		\
+	por	%xmm6, 	%xmm1;		\
+	movdqa	%xmm0, (%edx, %edi);	\
+	movdqa	%xmm1, 16(%edx, %edi);	\
+	lea	32(%edi), %edi;		\
+
+#define SHL0_COPY_POSTPROCESS			\
+	lea	32(%ecx), %ecx;			\
+	add	%ecx, %edi;			\
+	add	%edi, %edx;			\
+	add	%edi, %eax;			\
+	add	$4, %ecx;			\
+	and	$60, %ecx;			\
+	POP (%edi);				\
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+
+#define SHL0_GOBBLE_PREPROCESS			\
+	POP(%edi)
+
+#define SHL0_COPY_128BYTES			\
+	ALIGN_COPY_128BYTES %eax, %edx, %ecx
+
+#define HANDLE_64BYTES			\
+	ALIGN_COPY_64BYTES  %eax, %edx, %ecx
+
+#define HANDLE_32BYTES			\
+	ALIGN_COPY_32BYTES  %eax, %edx, %ecx
+
+#define HANDLE_16BYTES			\
+	ALIGN_COPY_16BYTES  %eax, %edx, %ecx
+
+#define HANDLE_LESS_16BYTES			\
+	add	%ecx, %edx;			\
+	add	%ecx, %eax;			\
+	add	$4, %ecx;			\
+	and	$60, %ecx;			\
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+/* the definition of PREPROCESS for unaligned case. 4, 8, 12 */
+#define SHL_PREPROCESS			\
+	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd));	\
+	lea	-32(%ecx), %ecx;				\
+	POP (%esi);						\
+	POP (%edi);						\
+
+/* the definition of POSTPROCESS for unaligned case. 4, 8, 12 */
+#define SHL_POSTPROCESS(x)			\
+	lea	32(%ecx), %ecx;			\
+	add	%ecx, %edx;			\
+	lea	x(%ecx, %eax), %eax;		\
+	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+/* the macro definition of shift copy in stage one . 32bytes */
+.macro SHL_COPY_STAGE_ONE x
+	movaps	16(%eax), %xmm2
+	sub	$32, %ecx
+	movaps	32(%eax), %xmm3
+	lea	32(%eax), %eax
+	movdqa	%xmm3, %xmm4
+	palignr	\x, %xmm2, %xmm3
+	lea	32(%edx), %edx
+	palignr	\x, %xmm1, %xmm2
+	por	%xmm6, %xmm2
+	movaps	%xmm2, -32(%edx)
+	por	%xmm6, %xmm3
+	movaps	%xmm3, -16(%edx)
+.endm
+
+/* the macro definition of shift copy in stage two . 32bytes */
+.macro SHL_COPY_STAGE_TWO x
+	movaps	16(%eax), %xmm2
+	sub	$32, %ecx
+	movaps	32(%eax), %xmm3
+	lea	32(%eax), %eax
+	movdqa	%xmm3, %xmm1
+	palignr	\x, %xmm2, %xmm3
+	lea	32(%edx), %edx
+	palignr	\x, %xmm4, %xmm2
+	por	%xmm6, %xmm2
+	movaps	%xmm2, -32(%edx)
+	por	%xmm6, %xmm3
+	movaps	%xmm3, -16(%edx)
+.endm
+
+/* the macro definition of forward copy for the remaining small bytes.*/
+.macro fwd_write_bytes x
+	movl	$0xff000000, %ecx
+	or	-\x(%eax), %ecx
+	movl	%ecx, -\x(%edx)
+.endm
+
+#define FWD_WRITE_0BYTES			\
+	movl	DEST(%esp), %eax;		\
+	RETURN
+
+#endif
diff --git a/pixman/pixman-ssse3.c b/pixman/pixman-ssse3.c
index 8025ced..69d7a4a 100644
--- a/pixman/pixman-ssse3.c
+++ b/pixman/pixman-ssse3.c
@@ -32,7 +32,54 @@
 #include "pixman-private.h"
 
 #ifdef USE_SSSE3
+
+/*---------------------------------------------------------------------
+ * src_x888_8888
+ */
+extern void *composite_line_src_x888_8888_ssse3(uint32_t * dest,
+						uint32_t *src,
+						int32_t count);
+static void
+ssse3_composite_src_x888_8888(pixman_implementation_t	*imp,
+			      pixman_op_t		op,
+			      pixman_image_t		*src_image,
+			      pixman_image_t		*mask_image,
+			      pixman_image_t		*dst_image,
+			      int32_t			src_x,
+			      int32_t			src_y,
+			      int32_t			mask_x,
+			      int32_t			mask_y,
+			      int32_t			dest_x,
+			      int32_t			dest_y,
+			      int32_t			width,
+			      int32_t			height)
+{
+    /* Row cursors: each points at the first pixel of the current scanline. */
+    uint32_t *dst_row;
+    uint32_t *src_row;
+    int dst_stride, src_stride;
+
+    /* PIXMAN_IMAGE_GET_LINE sets the row pointer and stride for each image;
+     * the strides are added to uint32_t pointers below. */
+    PIXMAN_IMAGE_GET_LINE (
+	dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_row, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	src_image, src_x, src_y, uint32_t, src_stride, src_row, 1);
+
+    /* Hand one scanline at a time to the assembly routine.  Its length
+     * argument is a byte count, hence the factor of 4 per 32-bit pixel. */
+    for (; height != 0; height--)
+    {
+	composite_line_src_x888_8888_ssse3 (dst_row, src_row, width * 4);
+	dst_row += dst_stride;
+	src_row += src_stride;
+    }
+}
+
 static const pixman_fast_path_t ssse3_fast_paths[] = {
+    /* PIXMAN_OP_SRC */
+    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, ssse3_composite_src_x888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, ssse3_composite_src_x888_8888),
     { PIXMAN_OP_NONE },
 };
 
-- 
1.7.0.4