[Pixman] [ssse3] Optimization for fetch_scanline_x8r8g8b8
Ma, Ling
ling.ma at intel.com
Tue Sep 7 04:06:40 PDT 2010
> Your code can still be simplified a lot. I'm just not quite sure whether it would
> be more practical to commit something first and then refactor it in follow-up
> commits, or to attempt to make a "perfect" patch before committing.
[Ma Ling] Yes, I agree with you; let us commit it first and then strengthen it,
for example by adding non-temporal instructions (sketched below) for large copies that exceed the L1 cache size.
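
To make that concrete, here is a minimal sketch (not part of this patch) of such a non-temporal copy loop. It reuses the register roles of the code below (%rsi = source, %rdi = destination, %rdx = byte count) and the %xmm6 alpha-fill mask, and assumes 16-byte-aligned pointers and a non-zero count that is a multiple of 32:

L(nt_loop):
	movdqa	(%rsi), %xmm0		/* aligned loads */
	movdqa	16(%rsi), %xmm2
	por	%xmm6, %xmm0		/* force alpha to 0xff for x8r8g8b8 */
	por	%xmm6, %xmm2
	movntdq	%xmm0, (%rdi)		/* non-temporal stores bypass the cache */
	movntdq	%xmm2, 16(%rdi)
	lea	32(%rsi), %rsi
	lea	32(%rdi), %rdi
	sub	$32, %rdx
	jnz	L(nt_loop)
	sfence				/* order the weakly-ordered NT stores */

Because non-temporal stores do not allocate cache lines, a copy larger than L1 would no longer evict the rest of the working set, which is where the win should come from.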
Best Regards
Ling
>
>
> pixman/pixman-access-ssse3_x86-64.S | 96 ++++------------------------------
> 1 files changed, 12 insertions(+), 84 deletions(-)
>
> diff --git a/pixman/pixman-access-ssse3_x86-64.S b/pixman/pixman-access-ssse3_x86-64.S
> index e7cf21f..0946d20 100755
> --- a/pixman/pixman-access-ssse3_x86-64.S
> +++ b/pixman/pixman-access-ssse3_x86-64.S
> @@ -248,116 +248,44 @@ L(shl_0_cache_less_16bytes):
> add %rdx, %rdi
> BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %rdx, 4)
>
> -L(shl_4):
> +.irp shift, 4, 8, 12
> +L(shl_\shift):
> lea -32(%rdx), %rdx
> ALIGN (4)
> -L(shl_4_loop):
> +L(shl_\shift\()_loop):
> movaps 16(%rsi), %xmm2
> sub $32, %rdx
> movaps 32(%rsi), %xmm3
> lea 32(%rsi), %rsi
> movdqa %xmm3, %xmm4
> - palignr $4, %xmm2, %xmm3
> + palignr $\shift, %xmm2, %xmm3
> lea 32(%rdi), %rdi
> - palignr $4, %xmm1, %xmm2
> + palignr $\shift, %xmm1, %xmm2
> por %xmm6, %xmm2
> movaps %xmm2, -32(%rdi)
> por %xmm6, %xmm3
> movaps %xmm3, -16(%rdi)
> - jb L(shl_4_end)
> + jb L(shl_\shift\()_end)
>
> movaps 16(%rsi), %xmm2
> sub $32, %rdx
> movaps 32(%rsi), %xmm3
> lea 32(%rsi), %rsi
> movdqa %xmm3, %xmm1
> - palignr $4, %xmm2, %xmm3
> + palignr $\shift, %xmm2, %xmm3
> lea 32(%rdi), %rdi
> - palignr $4, %xmm4, %xmm2
> + palignr $\shift, %xmm4, %xmm2
> por %xmm6, %xmm2
> movaps %xmm2, -32(%rdi)
> por %xmm6, %xmm3
> movaps %xmm3, -16(%rdi)
> - jae L(shl_4_loop)
> -L(shl_4_end):
> + jae L(shl_\shift\()_loop)
> +L(shl_\shift\()_end):
> lea 32(%rdx), %rdx
> - lea 4(%rsi, %rdx), %rsi
> - add %rdx, %rdi
> - BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %rdx, 4)
> -
> -L(shl_8):
> - lea -32(%rdx), %rdx
> - ALIGN (4)
> -L(shl_8_loop):
> - movaps 16(%rsi), %xmm2
> - sub $32, %rdx
> - movaps 32(%rsi), %xmm3
> - lea 32(%rsi), %rsi
> - movdqa %xmm3, %xmm4
> - palignr $8, %xmm2, %xmm3
> - lea 32(%rdi), %rdi
> - palignr $8, %xmm1, %xmm2
> - por %xmm6, %xmm2
> - movaps %xmm2, -32(%rdi)
> - por %xmm6, %xmm3
> - movaps %xmm3, -16(%rdi)
> - jb L(shl_8_end)
> -
> - movaps 16(%rsi), %xmm2
> - sub $32, %rdx
> - movaps 32(%rsi), %xmm3
> - lea 32(%rsi), %rsi
> - movdqa %xmm3, %xmm1
> - palignr $8, %xmm2, %xmm3
> - lea 32(%rdi), %rdi
> - palignr $8, %xmm4, %xmm2
> - por %xmm6, %xmm2
> - movaps %xmm2, -32(%rdi)
> - por %xmm6, %xmm3
> - movaps %xmm3, -16(%rdi)
> - jae L(shl_8_loop)
> -L(shl_8_end):
> - lea 32(%rdx), %rdx
> - lea 8(%rsi, %rdx), %rsi
> - add %rdx, %rdi
> - BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %rdx, 4)
> -
> -L(shl_12):
> - lea -32(%rdx), %rdx
> - ALIGN (4)
> -L(shl_12_loop):
> - movaps 16(%rsi), %xmm2
> - sub $32, %rdx
> - movaps 32(%rsi), %xmm3
> - lea 32(%rsi), %rsi
> - movdqa %xmm3, %xmm4
> - palignr $12, %xmm2, %xmm3
> - lea 32(%rdi), %rdi
> - palignr $12, %xmm1, %xmm2
> - por %xmm6, %xmm2
> - movaps %xmm2, -32(%rdi)
> - por %xmm6, %xmm3
> - movaps %xmm3, -16(%rdi)
> - jb L(shl_12_end)
> -
> - movaps 16(%rsi), %xmm2
> - sub $32, %rdx
> - movaps 32(%rsi), %xmm3
> - lea 32(%rsi), %rsi
> - movdqa %xmm3, %xmm1
> - palignr $12, %xmm2, %xmm3
> - lea 32(%rdi), %rdi
> - palignr $12, %xmm4, %xmm2
> - por %xmm6, %xmm2
> - movaps %xmm2, -32(%rdi)
> - por %xmm6, %xmm3
> - movaps %xmm3, -16(%rdi)
> - jae L(shl_12_loop)
> -L(shl_12_end):
> - lea 32(%rdx), %rdx
> - lea 12(%rsi, %rdx), %rsi
> + lea \shift\()(%rsi, %rdx), %rsi
> add %rdx, %rdi
> BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %rdx, 4)
> +.endr
>
> ALIGN (4)
> L(fwd_write_44bytes):
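
A note for readers of the patch above: as far as I can tell, the shl_N paths handle a source that is offset by N bytes from 16-byte alignment, reading aligned 16-byte blocks and stitching consecutive ones together with palignr, while por %xmm6 fills in the alpha channel that x8r8g8b8 leaves undefined. The .irp directive assembles its body once for each value in the list, substituting \shift every time; \() merely terminates the argument name, so L(shl_\shift\()_loop) expands to L(shl_4_loop) instead of referencing a nonexistent argument "shift_loop". A tiny standalone illustration of the construct (not from the patch):

.irp shift, 4, 8, 12
demo_\shift:
	palignr	$\shift, %xmm1, %xmm2	/* immediate becomes 4, 8 and 12 */
.endr

This expands to three labelled palignr instructions, which is exactly why the single .irp body can replace the three hand-written shl_4/shl_8/shl_12 blocks deleted by the patch.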