[Pixman] [ssse3] Optimization for fetch_scanline_x8r8g8b8
Siarhei Siamashka
siarhei.siamashka at gmail.com
Fri Sep 3 05:47:04 PDT 2010
On Friday 03 September 2010 11:53:47 Xu, Samuel wrote:
> > Siarhei asked whether it would be possible to unify the 32 and 64
> > bit assembly sources. I don't think you commented on that.
>
> I think it is very difficult to unify the 32 and 64 bit assembly sources.
It's not so difficult if you change the explicit register names to macros.
Then the code can use something like SRC_PTR, which expands to %rsi
on x86-64 and to %esi on x86.
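To make that concrete, here is a minimal sketch of the idea (not taken from
any existing patch; SRC_PTR/DST_PTR are just placeholder names, and it assumes
the file keeps the .S suffix so it still goes through the C preprocessor):

#ifdef __x86_64__
# define SRC_PTR %rsi	/* source pointer register on x86-64 */
# define DST_PTR %rdi	/* destination pointer register on x86-64 */
#else
# define SRC_PTR %esi	/* source pointer register on x86 */
# define DST_PTR %edi	/* destination pointer register on x86 */
#endif

	/* the same instruction sequence then assembles for both ABIs */
	movaps	(SRC_PTR), %xmm0
	movaps	%xmm0, (DST_PTR)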
There are other things which can be improved too. For example, using the
".irp/.endr" directives can reduce the source code size by collapsing
repeated blocks.
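For instance, something like

	.irp offset, 0, 16, 32, 48
	movaps	\offset\()(%rsi), %xmm0
	movaps	%xmm0, \offset\()(%rdi)
	.endr

is assembled once per listed value, with \offset substituted each time
(just a generic illustration, not from your patch; the diff below applies
the same trick to fold the shl_4/shl_8/shl_12 blocks into one).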
Your code can still be simplified a lot. I'm just not quite sure whether it
would be more practical to commit something first and then refactor it in
follow-up commits, or to attempt a "perfect" patch before committing.
pixman/pixman-access-ssse3_x86-64.S | 96 ++++------------------------------
1 files changed, 12 insertions(+), 84 deletions(-)
diff --git a/pixman/pixman-access-ssse3_x86-64.S b/pixman/pixman-access-ssse3_x86-64.S
index e7cf21f..0946d20 100755
--- a/pixman/pixman-access-ssse3_x86-64.S
+++ b/pixman/pixman-access-ssse3_x86-64.S
@@ -248,116 +248,44 @@ L(shl_0_cache_less_16bytes):
add %rdx, %rdi
BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %rdx, 4)
-L(shl_4):
+.irp shift, 4, 8, 12
+L(shl_\shift):
lea -32(%rdx), %rdx
ALIGN (4)
-L(shl_4_loop):
+L(shl_\shift\()_loop):
movaps 16(%rsi), %xmm2
sub $32, %rdx
movaps 32(%rsi), %xmm3
lea 32(%rsi), %rsi
movdqa %xmm3, %xmm4
- palignr $4, %xmm2, %xmm3
+ palignr $\shift, %xmm2, %xmm3
lea 32(%rdi), %rdi
- palignr $4, %xmm1, %xmm2
+ palignr $\shift, %xmm1, %xmm2
por %xmm6, %xmm2
movaps %xmm2, -32(%rdi)
por %xmm6, %xmm3
movaps %xmm3, -16(%rdi)
- jb L(shl_4_end)
+ jb L(shl_\shift\()_end)
movaps 16(%rsi), %xmm2
sub $32, %rdx
movaps 32(%rsi), %xmm3
lea 32(%rsi), %rsi
movdqa %xmm3, %xmm1
- palignr $4, %xmm2, %xmm3
+ palignr $\shift, %xmm2, %xmm3
lea 32(%rdi), %rdi
- palignr $4, %xmm4, %xmm2
+ palignr $\shift, %xmm4, %xmm2
por %xmm6, %xmm2
movaps %xmm2, -32(%rdi)
por %xmm6, %xmm3
movaps %xmm3, -16(%rdi)
- jae L(shl_4_loop)
-L(shl_4_end):
+ jae L(shl_\shift\()_loop)
+L(shl_\shift\()_end):
lea 32(%rdx), %rdx
- lea 4(%rsi, %rdx), %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %rdx, 4)
-
-L(shl_8):
- lea -32(%rdx), %rdx
- ALIGN (4)
-L(shl_8_loop):
- movaps 16(%rsi), %xmm2
- sub $32, %rdx
- movaps 32(%rsi), %xmm3
- lea 32(%rsi), %rsi
- movdqa %xmm3, %xmm4
- palignr $8, %xmm2, %xmm3
- lea 32(%rdi), %rdi
- palignr $8, %xmm1, %xmm2
- por %xmm6, %xmm2
- movaps %xmm2, -32(%rdi)
- por %xmm6, %xmm3
- movaps %xmm3, -16(%rdi)
- jb L(shl_8_end)
-
- movaps 16(%rsi), %xmm2
- sub $32, %rdx
- movaps 32(%rsi), %xmm3
- lea 32(%rsi), %rsi
- movdqa %xmm3, %xmm1
- palignr $8, %xmm2, %xmm3
- lea 32(%rdi), %rdi
- palignr $8, %xmm4, %xmm2
- por %xmm6, %xmm2
- movaps %xmm2, -32(%rdi)
- por %xmm6, %xmm3
- movaps %xmm3, -16(%rdi)
- jae L(shl_8_loop)
-L(shl_8_end):
- lea 32(%rdx), %rdx
- lea 8(%rsi, %rdx), %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %rdx, 4)
-
-L(shl_12):
- lea -32(%rdx), %rdx
- ALIGN (4)
-L(shl_12_loop):
- movaps 16(%rsi), %xmm2
- sub $32, %rdx
- movaps 32(%rsi), %xmm3
- lea 32(%rsi), %rsi
- movdqa %xmm3, %xmm4
- palignr $12, %xmm2, %xmm3
- lea 32(%rdi), %rdi
- palignr $12, %xmm1, %xmm2
- por %xmm6, %xmm2
- movaps %xmm2, -32(%rdi)
- por %xmm6, %xmm3
- movaps %xmm3, -16(%rdi)
- jb L(shl_12_end)
-
- movaps 16(%rsi), %xmm2
- sub $32, %rdx
- movaps 32(%rsi), %xmm3
- lea 32(%rsi), %rsi
- movdqa %xmm3, %xmm1
- palignr $12, %xmm2, %xmm3
- lea 32(%rdi), %rdi
- palignr $12, %xmm4, %xmm2
- por %xmm6, %xmm2
- movaps %xmm2, -32(%rdi)
- por %xmm6, %xmm3
- movaps %xmm3, -16(%rdi)
- jae L(shl_12_loop)
-L(shl_12_end):
- lea 32(%rdx), %rdx
- lea 12(%rsi, %rdx), %rsi
+ lea \shift\()(%rsi, %rdx), %rsi
add %rdx, %rdi
BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %rdx, 4)
+.endr
ALIGN (4)
L(fwd_write_44bytes):