[Pixman] [ssse3] Optimization for fetch_scanline_x8r8g8b8

Ma, Ling ling.ma at intel.com
Tue Sep 7 04:06:40 PDT 2010


> Your code can still be simplified a lot. I'm just not quite sure whether it would
> be more practical to commit something first and then refactor it with follow-up
> commits, or to attempt to make a "perfect" patch before committing.
[Ma Ling] Yes, I agree with you; let us commit it first and then strengthen it,
for example by adding non-temporal store instructions for large copies that exceed the L1 cache size.
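Roughly, the idea would be something like the sketch below; the register
assignments, loop shape and alignment assumptions are only illustrative,
not the final code:

    /*
     * Illustrative only: copy loop using non-temporal stores (movntdq),
     * intended for scanlines larger than the L1 cache.
     * Assumes %rsi = src and %rdi = dst, both 16-byte aligned,
     * %rdx = byte count (nonzero multiple of 32), %xmm6 = alpha mask.
     */
    L(nt_loop):
            movdqa  (%rsi), %xmm0           /* load 32 bytes of source pixels */
            movdqa  16(%rsi), %xmm1
            por     %xmm6, %xmm0            /* set the x8 channel, as elsewhere */
            por     %xmm6, %xmm1
            movntdq %xmm0, (%rdi)           /* store without polluting the cache */
            movntdq %xmm1, 16(%rdi)
            add     $32, %rsi
            add     $32, %rdi
            sub     $32, %rdx
            jnz     L(nt_loop)
            sfence                          /* fence the weakly-ordered stores */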

Best Regards
Ling
> 
> 
>  pixman/pixman-access-ssse3_x86-64.S |   96 ++++------------------------------
>  1 files changed, 12 insertions(+), 84 deletions(-)
> 
> diff --git a/pixman/pixman-access-ssse3_x86-64.S b/pixman/pixman-access-ssse3_x86-64.S
> index e7cf21f..0946d20 100755
> --- a/pixman/pixman-access-ssse3_x86-64.S
> +++ b/pixman/pixman-access-ssse3_x86-64.S
> @@ -248,116 +248,44 @@ L(shl_0_cache_less_16bytes):
>         add     %rdx, %rdi
>         BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %rdx, 4)
> 
> -L(shl_4):
> +.irp shift, 4, 8, 12
> +L(shl_\shift):
>         lea     -32(%rdx), %rdx
>         ALIGN (4)
> -L(shl_4_loop):
> +L(shl_\shift\()_loop):
>         movaps  16(%rsi), %xmm2
>         sub     $32, %rdx
>         movaps  32(%rsi), %xmm3
>         lea     32(%rsi), %rsi
>         movdqa  %xmm3, %xmm4
> -       palignr $4, %xmm2, %xmm3
> +       palignr $\shift, %xmm2, %xmm3
>         lea     32(%rdi), %rdi
> -       palignr $4, %xmm1, %xmm2
> +       palignr $\shift, %xmm1, %xmm2
>         por     %xmm6, %xmm2
>         movaps  %xmm2, -32(%rdi)
>         por     %xmm6, %xmm3
>         movaps  %xmm3, -16(%rdi)
> -       jb      L(shl_4_end)
> +       jb      L(shl_\shift\()_end)
> 
>         movaps  16(%rsi), %xmm2
>         sub     $32, %rdx
>         movaps  32(%rsi), %xmm3
>         lea     32(%rsi), %rsi
>         movdqa  %xmm3, %xmm1
> -       palignr $4, %xmm2, %xmm3
> +       palignr $\shift, %xmm2, %xmm3
>         lea     32(%rdi), %rdi
> -       palignr $4, %xmm4, %xmm2
> +       palignr $\shift, %xmm4, %xmm2
>         por     %xmm6, %xmm2
>         movaps  %xmm2, -32(%rdi)
>         por     %xmm6, %xmm3
>         movaps  %xmm3, -16(%rdi)
> -       jae     L(shl_4_loop)
> -L(shl_4_end):
> +       jae     L(shl_\shift\()_loop)
> +L(shl_\shift\()_end):
>         lea     32(%rdx), %rdx
> -       lea     4(%rsi, %rdx), %rsi
> -       add     %rdx, %rdi
> -       BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %rdx, 4)
> -
> -L(shl_8):
> -       lea     -32(%rdx), %rdx
> -       ALIGN (4)
> -L(shl_8_loop):
> -       movaps  16(%rsi), %xmm2
> -       sub     $32, %rdx
> -       movaps  32(%rsi), %xmm3
> -       lea     32(%rsi), %rsi
> -       movdqa  %xmm3, %xmm4
> -       palignr $8, %xmm2, %xmm3
> -       lea     32(%rdi), %rdi
> -       palignr $8, %xmm1, %xmm2
> -       por     %xmm6, %xmm2
> -       movaps  %xmm2, -32(%rdi)
> -       por     %xmm6, %xmm3
> -       movaps  %xmm3, -16(%rdi)
> -       jb      L(shl_8_end)
> -
> -       movaps  16(%rsi), %xmm2
> -       sub     $32, %rdx
> -       movaps  32(%rsi), %xmm3
> -       lea     32(%rsi), %rsi
> -       movdqa  %xmm3, %xmm1
> -       palignr $8, %xmm2, %xmm3
> -       lea     32(%rdi), %rdi
> -       palignr $8, %xmm4, %xmm2
> -       por     %xmm6, %xmm2
> -       movaps  %xmm2, -32(%rdi)
> -       por     %xmm6, %xmm3
> -       movaps  %xmm3, -16(%rdi)
> -       jae     L(shl_8_loop)
> -L(shl_8_end):
> -       lea     32(%rdx), %rdx
> -       lea     8(%rsi, %rdx), %rsi
> -       add     %rdx, %rdi
> -       BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %rdx, 4)
> -
> -L(shl_12):
> -       lea     -32(%rdx), %rdx
> -       ALIGN (4)
> -L(shl_12_loop):
> -       movaps  16(%rsi), %xmm2
> -       sub     $32, %rdx
> -       movaps  32(%rsi), %xmm3
> -       lea     32(%rsi), %rsi
> -       movdqa  %xmm3, %xmm4
> -       palignr $12, %xmm2, %xmm3
> -       lea     32(%rdi), %rdi
> -       palignr $12, %xmm1, %xmm2
> -       por     %xmm6, %xmm2
> -       movaps  %xmm2, -32(%rdi)
> -       por     %xmm6, %xmm3
> -       movaps  %xmm3, -16(%rdi)
> -       jb      L(shl_12_end)
> -
> -       movaps  16(%rsi), %xmm2
> -       sub     $32, %rdx
> -       movaps  32(%rsi), %xmm3
> -       lea     32(%rsi), %rsi
> -       movdqa  %xmm3, %xmm1
> -       palignr $12, %xmm2, %xmm3
> -       lea     32(%rdi), %rdi
> -       palignr $12, %xmm4, %xmm2
> -       por     %xmm6, %xmm2
> -       movaps  %xmm2, -32(%rdi)
> -       por     %xmm6, %xmm3
> -       movaps  %xmm3, -16(%rdi)
> -       jae     L(shl_12_loop)
> -L(shl_12_end):
> -       lea     32(%rdx), %rdx
> -       lea     12(%rsi, %rdx), %rsi
> +       lea     \shift\()(%rsi, %rdx), %rsi
>         add     %rdx, %rdi
>         BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %rdx, 4)
> +.endr
> 
>         ALIGN (4)
>  L(fwd_write_44bytes):

