[Pixman] [ssse3] Optimization for fetch_scanline_x8r8g8b8
Siarhei Siamashka
siarhei.siamashka at gmail.com
Fri Sep 3 05:47:04 PDT 2010
On Friday 03 September 2010 11:53:47 Xu, Samuel wrote:
> > Siarhei asked whether it would be possible to unify the 32 and 64
> > bit assembly sources. I don't think you commented on that.
>
> I think it is very difficult to unify the 32 and 64 bit assembly sources.
It's not so difficult if you change the explicit register names to macros.
Then the code can use something like SRC_PTR, which expands to %rsi
on x86-64 and to %esi on x86.
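To make that concrete, here is a minimal sketch of the idea (not taken from
any existing patch; SRC_PTR/DST_PTR are just placeholder names, and it assumes
the file keeps the .S suffix so it still goes through the C preprocessor):

#ifdef __x86_64__
# define SRC_PTR %rsi	/* source pointer register on x86-64 */
# define DST_PTR %rdi	/* destination pointer register on x86-64 */
#else
# define SRC_PTR %esi	/* source pointer register on x86 */
# define DST_PTR %edi	/* destination pointer register on x86 */
#endif

	/* the same instruction sequence then assembles for both ABIs */
	movaps	(SRC_PTR), %xmm0
	movaps	%xmm0, (DST_PTR)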
There are other things which can be improved too. For example, using the
".irp/.endr" directives can reduce the source code size by collapsing
repeated blocks.
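For instance, something like

	.irp offset, 0, 16, 32, 48
	movaps	\offset\()(%rsi), %xmm0
	movaps	%xmm0, \offset\()(%rdi)
	.endr

is assembled once per listed value, with \offset substituted each time
(just a generic illustration, not from your patch; the diff below applies
the same trick to fold the shl_4/shl_8/shl_12 blocks into one).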
Your code can still be simplified a lot. I'm just not quite sure whether it
would be more practical to commit something first and then refactor it in
follow-up commits, or to attempt a "perfect" patch before committing.
pixman/pixman-access-ssse3_x86-64.S | 96 ++++------------------------------
1 files changed, 12 insertions(+), 84 deletions(-)
diff --git a/pixman/pixman-access-ssse3_x86-64.S b/pixman/pixman-access-ssse3_x86-64.S
index e7cf21f..0946d20 100755
--- a/pixman/pixman-access-ssse3_x86-64.S
+++ b/pixman/pixman-access-ssse3_x86-64.S
@@ -248,116 +248,44 @@ L(shl_0_cache_less_16bytes):
add %rdx, %rdi
BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %rdx, 4)
-L(shl_4):
+.irp shift, 4, 8, 12
+L(shl_\shift):
lea -32(%rdx), %rdx
ALIGN (4)
-L(shl_4_loop):
+L(shl_\shift\()_loop):
movaps 16(%rsi), %xmm2
sub $32, %rdx
movaps 32(%rsi), %xmm3
lea 32(%rsi), %rsi
movdqa %xmm3, %xmm4
- palignr $4, %xmm2, %xmm3
+ palignr $\shift, %xmm2, %xmm3
lea 32(%rdi), %rdi
- palignr $4, %xmm1, %xmm2
+ palignr $\shift, %xmm1, %xmm2
por %xmm6, %xmm2
movaps %xmm2, -32(%rdi)
por %xmm6, %xmm3
movaps %xmm3, -16(%rdi)
- jb L(shl_4_end)
+ jb L(shl_\shift\()_end)
movaps 16(%rsi), %xmm2
sub $32, %rdx
movaps 32(%rsi), %xmm3
lea 32(%rsi), %rsi
movdqa %xmm3, %xmm1
- palignr $4, %xmm2, %xmm3
+ palignr $\shift, %xmm2, %xmm3
lea 32(%rdi), %rdi
- palignr $4, %xmm4, %xmm2
+ palignr $\shift, %xmm4, %xmm2
por %xmm6, %xmm2
movaps %xmm2, -32(%rdi)
por %xmm6, %xmm3
movaps %xmm3, -16(%rdi)
- jae L(shl_4_loop)
-L(shl_4_end):
+ jae L(shl_\shift\()_loop)
+L(shl_\shift\()_end):
lea 32(%rdx), %rdx
- lea 4(%rsi, %rdx), %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %rdx, 4)
-
-L(shl_8):
- lea -32(%rdx), %rdx
- ALIGN (4)
-L(shl_8_loop):
- movaps 16(%rsi), %xmm2
- sub $32, %rdx
- movaps 32(%rsi), %xmm3
- lea 32(%rsi), %rsi
- movdqa %xmm3, %xmm4
- palignr $8, %xmm2, %xmm3
- lea 32(%rdi), %rdi
- palignr $8, %xmm1, %xmm2
- por %xmm6, %xmm2
- movaps %xmm2, -32(%rdi)
- por %xmm6, %xmm3
- movaps %xmm3, -16(%rdi)
- jb L(shl_8_end)
-
- movaps 16(%rsi), %xmm2
- sub $32, %rdx
- movaps 32(%rsi), %xmm3
- lea 32(%rsi), %rsi
- movdqa %xmm3, %xmm1
- palignr $8, %xmm2, %xmm3
- lea 32(%rdi), %rdi
- palignr $8, %xmm4, %xmm2
- por %xmm6, %xmm2
- movaps %xmm2, -32(%rdi)
- por %xmm6, %xmm3
- movaps %xmm3, -16(%rdi)
- jae L(shl_8_loop)
-L(shl_8_end):
- lea 32(%rdx), %rdx
- lea 8(%rsi, %rdx), %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %rdx, 4)
-
-L(shl_12):
- lea -32(%rdx), %rdx
- ALIGN (4)
-L(shl_12_loop):
- movaps 16(%rsi), %xmm2
- sub $32, %rdx
- movaps 32(%rsi), %xmm3
- lea 32(%rsi), %rsi
- movdqa %xmm3, %xmm4
- palignr $12, %xmm2, %xmm3
- lea 32(%rdi), %rdi
- palignr $12, %xmm1, %xmm2
- por %xmm6, %xmm2
- movaps %xmm2, -32(%rdi)
- por %xmm6, %xmm3
- movaps %xmm3, -16(%rdi)
- jb L(shl_12_end)
-
- movaps 16(%rsi), %xmm2
- sub $32, %rdx
- movaps 32(%rsi), %xmm3
- lea 32(%rsi), %rsi
- movdqa %xmm3, %xmm1
- palignr $12, %xmm2, %xmm3
- lea 32(%rdi), %rdi
- palignr $12, %xmm4, %xmm2
- por %xmm6, %xmm2
- movaps %xmm2, -32(%rdi)
- por %xmm6, %xmm3
- movaps %xmm3, -16(%rdi)
- jae L(shl_12_loop)
-L(shl_12_end):
- lea 32(%rdx), %rdx
- lea 12(%rsi, %rdx), %rsi
+ lea \shift\()(%rsi, %rdx), %rsi
add %rdx, %rdi
BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %rdx, 4)
+.endr
ALIGN (4)
L(fwd_write_44bytes):