[Pixman] [PATCH] mmx: compile on MIPS for Loongson-3A MMI optimizations
Matt Turner
mattst88 at gmail.com
Wed Sep 19 18:56:00 UTC 2018
On Tue, Sep 18, 2018 at 2:34 AM <xianjudiao at gmail.com> wrote:
>
> From: Xianju Diao <xianjudiao at gmail.com>
>
> make check:
> With USE_OPENMP enabled, the 'glyph-test' and 'cover-test' tests fail on Loongson-3A3000.
> Neither test passes even without the optimized code, so this may be a multi-core
> synchronization bug in the CPU; I will keep debugging it. For now I use an OpenMP
> critical section, and 'glyph-test' and 'cover-test' pass.
>
> benchmark:
> Results from running cairo-perf-trace on Loongson-3A (image and image16 backends,
> times in seconds, before -> after):
>
>                            image                image16
> gvim                       5.425 ->   5.069     5.531 ->   5.236
> poppler-reseau             2.149 ->   2.13      2.152 ->   2.139
> swfdec-giant-steps-full   18.672 ->   8.215    33.167 ->  18.28
> swfdec-giant-steps         7.014 ->   2.455    12.48  ->   5.982
> xfce4-terminal-al         13.695 ->   5.241    15.703 ->   5.859
> gnome-system-monitor      12.783 ->   7.058    12.780 ->   7.104
> grads-heat-map             0.482 ->   0.486     0.516 ->   0.514
> firefox-talos-svg        141.138 -> 134.621   152.495 -> 159.069
> firefox-talos-gfx         23.119 ->  14.437    24.870 ->  15.161
> firefox-world-map         32.018 ->  27.139    33.817 ->  28.085
> firefox-periodic-table    12.305 ->  12.443    12.876 ->  12.913
> evolution                  7.071 ->   3.564     8.550 ->   3.784
> firefox-planet-gnome      77.926 ->  67.526    81.554 ->  65.840
> ocitysmap                  4.934 ->   1.702     4.937 ->   1.701
> ---
Thanks for the patch. I will review it when I have time (I'm preparing
for a trip at the moment).
I have a Loongson3 system that I have found to be unstable. I assume
it is due to the hardware bugs that must be worked around in gcc and
binutils. I have patched both of them with the patches I found in
https://github.com/loongson-community/binutils-gdb etc., but I still
have instability. I would appreciate it very much if you could offer
some suggestions or help in improving the stability of my system.
Looks like there are a couple of different things happening in this
patch. We should try to split them up. One patch could be making the
assembly memcpy implementation usable on mips64. A separate patch
would add new functions to pixman-mmx.c.
A few quick comments inline.
> configure.ac | 7 +-
> pixman/Makefile.am | 4 +-
> pixman/loongson-mmintrin.h | 46 ++
> pixman/pixman-combine32.h | 6 +
> pixman/pixman-mips-dspr2-asm.h | 2 +-
> pixman/pixman-mips-memcpy-asm.S | 324 +++++-------
> pixman/pixman-mmx.c | 1088 ++++++++++++++++++++++++++++++++++++++-
> pixman/pixman-private.h | 32 +-
> pixman/pixman-solid-fill.c | 49 +-
> pixman/pixman-utils.c | 65 ++-
> test/Makefile.am | 2 +-
> test/utils.c | 8 +
This diff stat doesn't correspond to this patch.
> 12 files changed, 1418 insertions(+), 215 deletions(-)
>
> diff --git a/configure.ac b/configure.ac
> index e833e45..3e3dde5 100644
> --- a/configure.ac
> +++ b/configure.ac
> @@ -154,9 +154,9 @@ AC_CHECK_DECL([__amd64], [AMD64_ABI="yes"], [AMD64_ABI="no"])
> # has set CFLAGS.
> if test $SUNCC = yes && \
> test "x$test_CFLAGS" = "x" && \
> - test "$CFLAGS" = "-g"
> + test "$CFLAGS" = "-g -mabi=n64"
> then
> - CFLAGS="-O -g"
> + CFLAGS="-O -g -mabi=n64"
This isn't acceptable.
> fi
>
> #
> @@ -183,6 +183,7 @@ AC_SUBST(LT_VERSION_INFO)
> # Check for dependencies
>
> PIXMAN_CHECK_CFLAG([-Wall])
> +PIXMAN_CHECK_CFLAG([-mabi=n64])
> PIXMAN_CHECK_CFLAG([-Wdeclaration-after-statement])
> PIXMAN_CHECK_CFLAG([-Wno-unused-local-typedefs])
> PIXMAN_CHECK_CFLAG([-fno-strict-aliasing])
> @@ -273,7 +274,7 @@ dnl ===========================================================================
> dnl Check for Loongson Multimedia Instructions
>
> if test "x$LS_CFLAGS" = "x" ; then
> - LS_CFLAGS="-march=loongson2f"
> + LS_CFLAGS="-march=loongson3a"
Also not acceptable. I see that recent gcc and binutils have gotten
new options for enabling MMI separately from -march=loongson*. Maybe
we could use those if available.
I'm not sure there is currently a good solution. Let me think about it.
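(If I remember correctly, recent gcc spells the option -mloongson-mmi, with a matching option
in gas, but I'd have to double-check the exact names and minimum versions before relying on
them in configure.ac.)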
> fi
>
> have_loongson_mmi=no
> diff --git a/pixman/Makefile.am b/pixman/Makefile.am
> index 581b6f6..e3a080c 100644
> --- a/pixman/Makefile.am
> +++ b/pixman/Makefile.am
> @@ -122,7 +122,7 @@ libpixman_mips_dspr2_la_SOURCES = \
> pixman-mips-dspr2.h \
> pixman-mips-dspr2-asm.S \
> pixman-mips-dspr2-asm.h \
> - pixman-mips-memcpy-asm.S
> + #pixman-mips-memcpy-asm.S
Can't do this.
> libpixman_1_la_LIBADD += libpixman-mips-dspr2.la
>
> ASM_CFLAGS_mips_dspr2=
> @@ -131,7 +131,7 @@ endif
> # loongson code
> if USE_LOONGSON_MMI
> noinst_LTLIBRARIES += libpixman-loongson-mmi.la
> -libpixman_loongson_mmi_la_SOURCES = pixman-mmx.c loongson-mmintrin.h
> +libpixman_loongson_mmi_la_SOURCES = pixman-mmx.c loongson-mmintrin.h pixman-mips-memcpy-asm.S
> libpixman_loongson_mmi_la_CFLAGS = $(LS_CFLAGS)
> libpixman_1_la_LDFLAGS += $(LS_LDFLAGS)
> libpixman_1_la_LIBADD += libpixman-loongson-mmi.la
> diff --git a/pixman/loongson-mmintrin.h b/pixman/loongson-mmintrin.h
> index 086c6e0..f049463 100644
> --- a/pixman/loongson-mmintrin.h
> +++ b/pixman/loongson-mmintrin.h
> @@ -89,6 +89,17 @@ _mm_adds_pu8 (__m64 __m1, __m64 __m2)
> }
>
> extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_andn_si64 (__m64 __m1, __m64 __m2)
Doesn't seem to be used.
> +{
> + __m64 ret;
> + asm("pandn %0, %1, %2\n\t"
> + : "=f" (ret)
> + : "f" (__m1), "f"(__m2)
> + );
> + return ret;
> +}
> +
> +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> _mm_and_si64 (__m64 __m1, __m64 __m2)
> {
> __m64 ret;
> @@ -100,6 +111,17 @@ _mm_and_si64 (__m64 __m1, __m64 __m2)
> }
>
> extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
Doesn't seem to be used.
> +{
> + __m64 ret;
Whitespace mistake.
> + asm("pcmpeqh %0, %1, %2\n\t"
> + : "=f" (ret)
> + : "f" (__m1), "f" (__m2)
> + );
> + return ret;
> +}
> +
> +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> _mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
> {
> __m64 ret;
> @@ -110,6 +132,30 @@ _mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
> return ret;
> }
>
> +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +loongson_fand (__m64 __m1, __m64 __m2)
Doesn't seem to be used.
> +{
> + __m64 ret;
> + asm("fand %0, %1, %2\n\t"
> + : "=f" (ret)
> + : "f" (__m1), "f" (__m2)
> + );
> + return ret;
> +}
> +
> +
> +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
> +{
> + __m64 ret;
> + asm("pcmpgth %0, %1, %2\n\t"
> + : "=f" (ret)
> + : "f" (__m1), "f" (__m2)
> + );
> + return ret;
> +}
> +
> +
> extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> _mm_empty (void)
> {
> diff --git a/pixman/pixman-combine32.h b/pixman/pixman-combine32.h
> index cdd56a6..27f62d9 100644
> --- a/pixman/pixman-combine32.h
> +++ b/pixman/pixman-combine32.h
> @@ -14,6 +14,12 @@
> #define RB_ONE_HALF 0x800080
> #define RB_MASK_PLUS_ONE 0x10000100
>
> +#define RGB_MASK 0xffffff
> +#define RGB_DMASK 0xffffffffffffULL
> +#define R_DMASK 0x0000ffff00000000ULL
> +#define G_DMASK 0x00000000ffff0000ULL
> +#define B_DMASK 0x000000000000ffffULL
> +
> #define ALPHA_8(x) ((x) >> A_SHIFT)
> #define RED_8(x) (((x) >> R_SHIFT) & MASK)
> #define GREEN_8(x) (((x) >> G_SHIFT) & MASK)
> diff --git a/pixman/pixman-mips-dspr2-asm.h b/pixman/pixman-mips-dspr2-asm.h
> index e238566..63d7d96 100644
> --- a/pixman/pixman-mips-dspr2-asm.h
> +++ b/pixman/pixman-mips-dspr2-asm.h
> @@ -77,7 +77,7 @@
> .ent symbol, 0; \
> symbol: .frame sp, 0, ra; \
> .set push; \
> - .set arch=mips32r2; \
> + .set arch=mips64r2; \
Can't do this.
> .set noreorder; \
> .set noat;
>
> diff --git a/pixman/pixman-mips-memcpy-asm.S b/pixman/pixman-mips-memcpy-asm.S
> index 9ad6da5..a140191 100644
> --- a/pixman/pixman-mips-memcpy-asm.S
> +++ b/pixman/pixman-mips-memcpy-asm.S
> @@ -54,19 +54,20 @@ LEAF_MIPS32R2(pixman_mips_fast_memcpy)
>
> /* Test if the src and dst are word-aligned, or can be made word-aligned */
> xor t8, a1, a0
> - andi t8, t8, 0x3 /* t8 is a0/a1 word-displacement */
> + andi t8, t8, 0x7 /* t8 is a0/a1 word-displacement */
>
> bne t8, zero, $unaligned
> negu a3, a0
>
> - andi a3, a3, 0x3 /* we need to copy a3 bytes to make a0/a1 aligned */
> + andi a3, a3, 0x7 /* we need to copy a3 bytes to make a0/a1 aligned */
> beq a3, zero, $chk16w /* when a3=0 then the dst (a0) is word-aligned */
> subu a2, a2, a3 /* now a2 is the remining bytes count */
>
> - LWHI t8, 0(a1)
> - addu a1, a1, a3
> - SWHI t8, 0(a0)
> - addu a0, a0, a3
> + ld t8, 0(a1)
> + daddu a1, a1, a3
> + sdl t8, 7(a0)
> + sdr t8, 0(a0)
> + daddu a0, a0, a3
>
> /* Now the dst/src are mutually word-aligned with word-aligned addresses */
> $chk16w: andi t8, a2, 0x3f /* any whole 64-byte chunks? */
> @@ -76,9 +77,9 @@ $chk16w: andi t8, a2, 0x3f /* any whole 64-byte chunks? */
> /* There will be at most 1 32-byte chunk after it */
> subu a3, a2, t8 /* subtract from a2 the reminder */
> /* Here a3 counts bytes in 16w chunks */
> - addu a3, a0, a3 /* Now a3 is the final dst after 64-byte chunks */
> + daddu a3, a0, a3 /* Now a3 is the final dst after 64-byte chunks */
>
> - addu t0, a0, a2 /* t0 is the "past the end" address */
> + daddu t0, a0, a2 /* t0 is the "past the end" address */
>
> /*
> * When in the loop we exercise "pref 30, x(a0)", the a0+x should not be past
> @@ -89,119 +90,98 @@ $chk16w: andi t8, a2, 0x3f /* any whole 64-byte chunks? */
> */
> subu t9, t0, 160 /* t9 is the "last safe pref 30, 128(a0)" address */
>
> - pref 0, 0(a1) /* bring the first line of src, addr 0 */
> - pref 0, 32(a1) /* bring the second line of src, addr 32 */
> - pref 0, 64(a1) /* bring the third line of src, addr 64 */
> - pref 30, 32(a0) /* safe, as we have at least 64 bytes ahead */
> + lw $0, 0(a1) /* bring the first line of src, addr 0 */
> + lw $0, 32(a1) /* bring the second line of src, addr 32 */
> + lw $0, 64(a1) /* bring the third line of src, addr 64 */
> + lw $0, 32(a0) /* safe, as we have at least 64 bytes ahead */
> /* In case the a0 > t9 don't use "pref 30" at all */
> sgtu v1, a0, t9
> bgtz v1, $loop16w /* skip "pref 30, 64(a0)" for too short arrays */
> nop
> /* otherwise, start with using pref30 */
> - pref 30, 64(a0)
> + lw $0, 64(a0)
> $loop16w:
> - pref 0, 96(a1)
> - lw t0, 0(a1)
> + lw $0, 96(a1)
> + ld t0, 0(a1)
> bgtz v1, $skip_pref30_96 /* skip "pref 30, 96(a0)" */
> - lw t1, 4(a1)
> - pref 30, 96(a0) /* continue setting up the dest, addr 96 */
> + lw $0, 96(a0) /* continue setting up the dest, addr 96 */
> $skip_pref30_96:
> - lw t2, 8(a1)
> - lw t3, 12(a1)
> - lw t4, 16(a1)
> - lw t5, 20(a1)
> - lw t6, 24(a1)
> - lw t7, 28(a1)
> - pref 0, 128(a1) /* bring the next lines of src, addr 128 */
> -
> - sw t0, 0(a0)
> - sw t1, 4(a0)
> - sw t2, 8(a0)
> - sw t3, 12(a0)
> - sw t4, 16(a0)
> - sw t5, 20(a0)
> - sw t6, 24(a0)
> - sw t7, 28(a0)
> -
> - lw t0, 32(a1)
> + ld t2, 8(a1)
> + ld t4, 16(a1)
> + ld t6, 24(a1)
> + lw $0, 128(a1) /* bring the next lines of src, addr 128 */
> + lw $0, 0x0(a0)
> +
> + sd t0, 0(a0)
> + sd t2, 8(a0)
> + sd t4, 16(a0)
> + sd t6, 24(a0)
> +
> + ld t0, 32(a1)
> bgtz v1, $skip_pref30_128 /* skip "pref 30, 128(a0)" */
> - lw t1, 36(a1)
> - pref 30, 128(a0) /* continue setting up the dest, addr 128 */
> + lw $0, 128(a0) /* continue setting up the dest, addr 128 */
> $skip_pref30_128:
> - lw t2, 40(a1)
> - lw t3, 44(a1)
> - lw t4, 48(a1)
> - lw t5, 52(a1)
> - lw t6, 56(a1)
> - lw t7, 60(a1)
> - pref 0, 160(a1) /* bring the next lines of src, addr 160 */
> -
> - sw t0, 32(a0)
> - sw t1, 36(a0)
> - sw t2, 40(a0)
> - sw t3, 44(a0)
> - sw t4, 48(a0)
> - sw t5, 52(a0)
> - sw t6, 56(a0)
> - sw t7, 60(a0)
> -
> - addiu a0, a0, 64 /* adding 64 to dest */
> + ld t2, 40(a1)
> + ld t4, 48(a1)
> + ld t6, 56(a1)
> + lw $0, 160(a1) /* bring the next lines of src, addr 160 */
> + lw $0, 0x32(a0)
> +
> + sd t0, 32(a0)
> + sd t2, 40(a0)
> + sd t4, 48(a0)
> + sd t6, 56(a0)
> +
> + daddiu a0, a0, 64 /* adding 64 to dest */
> sgtu v1, a0, t9
> bne a0, a3, $loop16w
> - addiu a1, a1, 64 /* adding 64 to src */
> + daddiu a1, a1, 64 /* adding 64 to src */
> move a2, t8
>
> /* Here we have src and dest word-aligned but less than 64-bytes to go */
>
> $chk8w:
> - pref 0, 0x0(a1)
> + lw $0, 0x0(a1)
> andi t8, a2, 0x1f /* is there a 32-byte chunk? */
> /* the t8 is the reminder count past 32-bytes */
> beq a2, t8, $chk1w /* when a2=t8, no 32-byte chunk */
> nop
>
> - lw t0, 0(a1)
> - lw t1, 4(a1)
> - lw t2, 8(a1)
> - lw t3, 12(a1)
> - lw t4, 16(a1)
> - lw t5, 20(a1)
> - lw t6, 24(a1)
> - lw t7, 28(a1)
> - addiu a1, a1, 32
> -
> - sw t0, 0(a0)
> - sw t1, 4(a0)
> - sw t2, 8(a0)
> - sw t3, 12(a0)
> - sw t4, 16(a0)
> - sw t5, 20(a0)
> - sw t6, 24(a0)
> - sw t7, 28(a0)
> - addiu a0, a0, 32
> + ld t0, 0(a1)
> + ld t2, 8(a1)
> + ld t4, 16(a1)
> + ld t6, 24(a1)
> + lw $0, 0x0(a0)
> + daddiu a1, a1, 32
> +
> + sd t0, 0(a0)
> + sd t2, 8(a0)
> + sd t4, 16(a0)
> + sd t6, 24(a0)
> + daddiu a0, a0, 32
>
> $chk1w:
> andi a2, t8, 0x3 /* now a2 is the reminder past 1w chunks */
> beq a2, t8, $last8
> subu a3, t8, a2 /* a3 is count of bytes in 1w chunks */
> - addu a3, a0, a3 /* now a3 is the dst address past the 1w chunks */
> + daddu a3, a0, a3 /* now a3 is the dst address past the 1w chunks */
>
> /* copying in words (4-byte chunks) */
> $wordCopy_loop:
> lw t3, 0(a1) /* the first t3 may be equal t0 ... optimize? */
> - addiu a1, a1, 4
> - addiu a0, a0, 4
> + daddiu a1, a1, 4
> + daddiu a0, a0, 4
> bne a0, a3, $wordCopy_loop
> sw t3, -4(a0)
>
> /* For the last (<8) bytes */
> $last8:
> blez a2, leave
> - addu a3, a0, a2 /* a3 is the last dst address */
> + daddu a3, a0, a2 /* a3 is the last dst address */
> $last8loop:
> lb v1, 0(a1)
> - addiu a1, a1, 1
> - addiu a0, a0, 1
> + daddiu a1, a1, 1
> + daddiu a0, a0, 1
> bne a0, a3, $last8loop
> sb v1, -1(a0)
>
> @@ -214,15 +194,16 @@ leave: j ra
>
> $unaligned:
> /* got here with a3="negu a0" */
> - andi a3, a3, 0x3 /* test if the a0 is word aligned */
> + andi a3, a3, 0x7 /* test if the a0 is word aligned */
> beqz a3, $ua_chk16w
> subu a2, a2, a3 /* bytes left after initial a3 bytes */
>
> - LWHI v1, 0(a1)
> - LWLO v1, 3(a1)
> - addu a1, a1, a3 /* a3 may be here 1, 2 or 3 */
> - SWHI v1, 0(a0)
> - addu a0, a0, a3 /* below the dst will be word aligned (NOTE1) */
> + ldl v1, 7(a1)
> + ldr v1, 0(a1)
> + daddu a1, a1, a3 /* a3 may be here 1, 2 or 3 */
> + sdl v1, 7(a0)
> + sdr v1, 0(a0)
> + daddu a0, a0, a3 /* below the dst will be word aligned (NOTE1) */
>
> $ua_chk16w: andi t8, a2, 0x3f /* any whole 64-byte chunks? */
> /* t8 is the byte count after 64-byte chunks */
> @@ -230,149 +211,116 @@ $ua_chk16w: andi t8, a2, 0x3f /* any whole 64-byte chunks? */
> /* There will be at most 1 32-byte chunk after it */
> subu a3, a2, t8 /* subtract from a2 the reminder */
> /* Here a3 counts bytes in 16w chunks */
> - addu a3, a0, a3 /* Now a3 is the final dst after 64-byte chunks */
> + daddu a3, a0, a3 /* Now a3 is the final dst after 64-byte chunks */
>
> - addu t0, a0, a2 /* t0 is the "past the end" address */
> + daddu t0, a0, a2 /* t0 is the "past the end" address */
>
> subu t9, t0, 160 /* t9 is the "last safe pref 30, 128(a0)" address */
>
> - pref 0, 0(a1) /* bring the first line of src, addr 0 */
> - pref 0, 32(a1) /* bring the second line of src, addr 32 */
> - pref 0, 64(a1) /* bring the third line of src, addr 64 */
> - pref 30, 32(a0) /* safe, as we have at least 64 bytes ahead */
> + lw $0, 0(a1) /* bring the first line of src, addr 0 */
> + lw $0, 32(a1) /* bring the second line of src, addr 32 */
> + lw $0, 64(a1) /* bring the third line of src, addr 64 */
> + lw $0, 32(a0) /* safe, as we have at least 64 bytes ahead */
> /* In case the a0 > t9 don't use "pref 30" at all */
> sgtu v1, a0, t9
> bgtz v1, $ua_loop16w /* skip "pref 30, 64(a0)" for too short arrays */
> nop
> /* otherwise, start with using pref30 */
> - pref 30, 64(a0)
> + lw $0, 64(a0)
> $ua_loop16w:
> - pref 0, 96(a1)
> - LWHI t0, 0(a1)
> - LWLO t0, 3(a1)
> - LWHI t1, 4(a1)
> + lw $0, 96(a1)
> + ldl t0, 7(a1)
> + ldr t0, 0(a1)
> bgtz v1, $ua_skip_pref30_96
> - LWLO t1, 7(a1)
> - pref 30, 96(a0) /* continue setting up the dest, addr 96 */
> + lw $0, 96(a0) /* continue setting up the dest, addr 96 */
> $ua_skip_pref30_96:
> - LWHI t2, 8(a1)
> - LWLO t2, 11(a1)
> - LWHI t3, 12(a1)
> - LWLO t3, 15(a1)
> - LWHI t4, 16(a1)
> - LWLO t4, 19(a1)
> - LWHI t5, 20(a1)
> - LWLO t5, 23(a1)
> - LWHI t6, 24(a1)
> - LWLO t6, 27(a1)
> - LWHI t7, 28(a1)
> - LWLO t7, 31(a1)
> - pref 0, 128(a1) /* bring the next lines of src, addr 128 */
> -
> - sw t0, 0(a0)
> - sw t1, 4(a0)
> - sw t2, 8(a0)
> - sw t3, 12(a0)
> - sw t4, 16(a0)
> - sw t5, 20(a0)
> - sw t6, 24(a0)
> - sw t7, 28(a0)
> -
> - LWHI t0, 32(a1)
> - LWLO t0, 35(a1)
> - LWHI t1, 36(a1)
> + ldl t2, 15(a1)
> + ldr t2, 8(a1)
> + ldl t4, 23(a1)
> + ldr t4, 16(a1)
> + ldl t6, 31(a1)
> + ldr t6, 24(a1)
> + lw $0, 128(a1) /* bring the next lines of src, addr 128 */
> + lw $0, 0(a0)
> +
> + sd t0, 0(a0)
> + sd t2, 8(a0)
> + sd t4, 16(a0)
> + sd t6, 24(a0)
> +
> + ldl t0, 39(a1)
> + ldr t0, 32(a1)
> bgtz v1, $ua_skip_pref30_128
> - LWLO t1, 39(a1)
> - pref 30, 128(a0) /* continue setting up the dest, addr 128 */
> + lw $0, 128(a0) /* continue setting up the dest, addr 128 */
> $ua_skip_pref30_128:
> - LWHI t2, 40(a1)
> - LWLO t2, 43(a1)
> - LWHI t3, 44(a1)
> - LWLO t3, 47(a1)
> - LWHI t4, 48(a1)
> - LWLO t4, 51(a1)
> - LWHI t5, 52(a1)
> - LWLO t5, 55(a1)
> - LWHI t6, 56(a1)
> - LWLO t6, 59(a1)
> - LWHI t7, 60(a1)
> - LWLO t7, 63(a1)
> - pref 0, 160(a1) /* bring the next lines of src, addr 160 */
> -
> - sw t0, 32(a0)
> - sw t1, 36(a0)
> - sw t2, 40(a0)
> - sw t3, 44(a0)
> - sw t4, 48(a0)
> - sw t5, 52(a0)
> - sw t6, 56(a0)
> - sw t7, 60(a0)
> -
> - addiu a0, a0, 64 /* adding 64 to dest */
> + ldl t2, 47(a1)
> + ldr t2, 40(a1)
> + ldl t4, 55(a1)
> + ldr t4, 48(a1)
> + ldl t6, 63(a1)
> + ldr t6, 56(a1)
> + lw $0, 32(a0)
> + lw $0, 160(a1) /* bring the next lines of src, addr 160 */
> +
> + sd t0, 32(a0)
> + sd t2, 40(a0)
> + sd t4, 48(a0)
> + sd t6, 56(a0)
> +
> + daddiu a0, a0, 64 /* adding 64 to dest */
> sgtu v1, a0, t9
> bne a0, a3, $ua_loop16w
> - addiu a1, a1, 64 /* adding 64 to src */
> + daddiu a1, a1, 64 /* adding 64 to src */
> move a2, t8
>
> /* Here we have src and dest word-aligned but less than 64-bytes to go */
>
> $ua_chk8w:
> - pref 0, 0x0(a1)
> + lw $0, 0x0(a1)
> andi t8, a2, 0x1f /* is there a 32-byte chunk? */
> /* the t8 is the reminder count */
> beq a2, t8, $ua_chk1w /* when a2=t8, no 32-byte chunk */
>
> - LWHI t0, 0(a1)
> - LWLO t0, 3(a1)
> - LWHI t1, 4(a1)
> - LWLO t1, 7(a1)
> - LWHI t2, 8(a1)
> - LWLO t2, 11(a1)
> - LWHI t3, 12(a1)
> - LWLO t3, 15(a1)
> - LWHI t4, 16(a1)
> - LWLO t4, 19(a1)
> - LWHI t5, 20(a1)
> - LWLO t5, 23(a1)
> - LWHI t6, 24(a1)
> - LWLO t6, 27(a1)
> - LWHI t7, 28(a1)
> - LWLO t7, 31(a1)
> - addiu a1, a1, 32
> -
> - sw t0, 0(a0)
> - sw t1, 4(a0)
> - sw t2, 8(a0)
> - sw t3, 12(a0)
> - sw t4, 16(a0)
> - sw t5, 20(a0)
> - sw t6, 24(a0)
> - sw t7, 28(a0)
> - addiu a0, a0, 32
> + ldl t0, 7(a1)
> + ldr t0, 0(a1)
> + ldl t2, 15(a1)
> + ldr t2, 8(a1)
> + ldl t4, 23(a1)
> + ldr t4, 16(a1)
> + ldl t6, 31(a1)
> + ldr t6, 24(a1)
> + lw $0, 0x0(a0)
> + daddiu a1, a1, 32
> +
> + sd t0, 0(a0)
> + sd t2, 8(a0)
> + sd t4, 16(a0)
> + sd t6, 24(a0)
> + daddiu a0, a0, 32
>
> $ua_chk1w:
> andi a2, t8, 0x3 /* now a2 is the reminder past 1w chunks */
> beq a2, t8, $ua_smallCopy
> subu a3, t8, a2 /* a3 is count of bytes in 1w chunks */
> - addu a3, a0, a3 /* now a3 is the dst address past the 1w chunks */
> + daddu a3, a0, a3 /* now a3 is the dst address past the 1w chunks */
>
> /* copying in words (4-byte chunks) */
> $ua_wordCopy_loop:
> LWHI v1, 0(a1)
> LWLO v1, 3(a1)
> - addiu a1, a1, 4
> - addiu a0, a0, 4 /* note: dst=a0 is word aligned here, see NOTE1 */
> + daddiu a1, a1, 4
> + daddiu a0, a0, 4 /* note: dst=a0 is word aligned here, see NOTE1 */
> bne a0, a3, $ua_wordCopy_loop
> sw v1, -4(a0)
>
> /* Now less than 4 bytes (value in a2) left to copy */
> $ua_smallCopy:
> beqz a2, leave
> - addu a3, a0, a2 /* a3 is the last dst address */
> + daddu a3, a0, a2 /* a3 is the last dst address */
> $ua_smallCopy_loop:
> lb v1, 0(a1)
> - addiu a1, a1, 1
> - addiu a0, a0, 1
> + daddiu a1, a1, 1
> + daddiu a0, a0, 1
> bne a0, a3, $ua_smallCopy_loop
> sb v1, -1(a0)
>
> diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
> index dec3974..edbf16b 100644
> --- a/pixman/pixman-mmx.c
> +++ b/pixman/pixman-mmx.c
> @@ -59,6 +59,71 @@ _mm_empty (void)
> }
> #endif
>
> +#define COMBINE_A_OUT 1
> +#define COMBINE_A_IN 2
> +#define COMBINE_B_OUT 4
> +#define COMBINE_B_IN 8
> +
> +#define COMBINE_CLEAR 0
> +#define COMBINE_A (COMBINE_A_OUT | COMBINE_A_IN)
> +#define COMBINE_B (COMBINE_B_OUT | COMBINE_B_IN)
> +#define COMBINE_A_OVER (COMBINE_A_OUT | COMBINE_B_OUT | COMBINE_A_IN)
> +#define COMBINE_B_OVER (COMBINE_A_OUT | COMBINE_B_OUT | COMBINE_B_IN)
> +#define COMBINE_A_ATOP (COMBINE_B_OUT | COMBINE_A_IN)
> +#define COMBINE_B_ATOP (COMBINE_A_OUT | COMBINE_B_IN)
> +#define COMBINE_XOR (COMBINE_A_OUT | COMBINE_B_OUT)
> +
> +/* no SIMD instructions for div, so leave it alone
> + * portion covered by a but not b
> + * min (1, (1-b) / a)
> + */
> +static uint8_t
> +combine_disjoint_out_part (uint8_t a, uint8_t b)
> +{
> +
> + b = ~b;
> + if (b >= a)
> + return MASK;
> + return DIV_UN8 (b, a);
> +}
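(A quick worked example of the clamp, in 8-bit fixed point where MASK = 0xff stands for 1.0:
with a = 0x80 (~0.5) and b = 0x40 (~0.25), ~b = 0xbf >= a, so the function returns MASK because
(1 - b) / a would exceed 1.0; otherwise DIV_UN8 (b, a) computes the ratio directly.)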
> +
> +/* portion covered by both a and b
> + * max (1-(1-b)/a, 0)
> + */
> +static uint8_t
> +combine_disjoint_in_part (uint8_t a, uint8_t b)
> +{
> +
> + b = ~b;
> + if (b >= a)
> + return 0;
> + return ~DIV_UN8(b, a);
> +}
> +
> +/* portion covered by a but not b
> + * max (1-b/a ,0)
> + * */
> +static uint8_t
> +combine_conjoint_out_part (uint8_t a, uint8_t b)
> +{
> +
> + if (b >= a)
> + return 0x00;
> + return ~DIV_UN8(b, a);
> +}
> +
> +/* portion covered by both a and b
> + * min (1, b/a)
> + */
> +static uint8_t
> +combine_conjoint_in_part (uint8_t a, uint8_t b)
> +{
> +
> + if (b >= a)
> + return MASK;
> + return DIV_UN8 (b, a);
> +}
> +
> #ifdef USE_X86_MMX
> # if (defined(__SUNPRO_C) || defined(_MSC_VER) || defined(_WIN64))
> # include <xmmintrin.h>
> @@ -78,7 +143,8 @@ _mm_movemask_pi8 (__m64 __A)
>
> return ret;
> }
> -
> +#define __OPTIMIZE__
> +#ifdef __OPTIMIZE__
> extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> _mm_mulhi_pu16 (__m64 __A, __m64 __B)
> {
> @@ -88,7 +154,7 @@ _mm_mulhi_pu16 (__m64 __A, __m64 __B)
> );
> return __A;
> }
> -
> +#else
> # define _mm_shuffle_pi16(A, N) \
> ({ \
> __m64 ret; \
> @@ -102,7 +168,7 @@ _mm_mulhi_pu16 (__m64 __A, __m64 __B)
> })
> # endif
> #endif
> -
> +#endif
> #ifndef _MSC_VER
> #define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
> (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
> @@ -710,6 +776,34 @@ combine (const uint32_t *src, const uint32_t *mask)
> return vsrc;
> }
>
> +static force_inline void
> +mmx_combine_mask_ca(const uint32_t *src, const uint32_t *mask, __m64 *s64, __m64 *m64)
> +{
> + __m64 res, tmp;
> +
> + if(!(*mask))
> + {
> + *s64 = 0;
> + *m64 = 0;
> + return;
> + }
> +
> + *s64 = load8888(src);
> +
> + if (*mask == ~0)
> + {
> + *m64 = expand_alpha(*s64);
> + return;
> + }
> +
> + *m64 = load8888(mask);
> +
> + res = pix_multiply(*s64, *m64);
> + tmp = expand_alpha(*s64);
> + *s64 = res;
> + *m64 = pix_multiply(*m64, tmp);
> +}
> +
> static force_inline __m64
> core_combine_over_u_pixel_mmx (__m64 vsrc, __m64 vdst)
> {
> @@ -729,6 +823,39 @@ core_combine_over_u_pixel_mmx (__m64 vsrc, __m64 vdst)
> }
>
> static void
> +mmx_combine_disjoint_over_u (pixman_implementation_t *imp,
> + pixman_op_t op,
> + uint32_t * dest,
> + const uint32_t * src,
> + const uint32_t * mask,
> + int width)
> +{
> + uint32_t *end = dest + width;
> + uint32_t s32;
> + uint64_t sa64;
> + __m64 s64, d64;
> +
> + while (dest < end)
> + {
> + s64 = combine (src, mask);
> +
> + if (s64)
> + {
> + store8888(&s32, s64);
> + sa64 = combine_disjoint_out_part (*dest >> A_SHIFT, s32 >> A_SHIFT);
> + d64 = pix_add (pix_multiply (load8888 (dest),expand_alpha_rev ((*(__m64*)&sa64))), s64);
> + store8888 (dest, d64);
> + }
> +
> + ++dest;
> + ++src;
> + if (mask)
> + ++mask;
> +
> + }
> +}
> +
> +static void
> mmx_combine_over_u (pixman_implementation_t *imp,
> pixman_op_t op,
> uint32_t * dest,
> @@ -1062,7 +1189,294 @@ mmx_combine_saturate_u (pixman_implementation_t *imp,
> }
> _mm_empty ();
> }
> +/* In functions such as combine_conjoint_general_u there are multiple branches selected by the
> + * 'combine' parameter. That value does not change while the function runs, so there is no need
> + * to test it for every pixel as the original code does: it can be evaluated once at function
> + * entry to pick the corresponding function pointers, which are then called directly. */
> +#define DEF_FUNC_ZERO_MASK(type, zm, suffix, res) \
> + static type inline combine_joint_ ##zm## _ ##suffix( type sa, type da, type io_flag) \
> + { \
> + return res; \
> + }
> +
> +/* 'conjoint' has the same code structure as 'disjoint'; only the function name differs, so this macro
> + * generates the corresponding function. The argument order is selected by 'io_flag': '0' for 'in_part' and '1' for 'out_part'.
> + */
> +#define DEF_FUNC_COMBINE_JOINT_U(cd, io) \
> + static uint8_t inline combine_ ##cd## joint_ ##io## _part_u(uint8_t sa, uint8_t da, uint8_t io_flag) \
> + { \
> + uint8_t parm[2]; \
> + parm[0] = sa * (io_flag ^ 0x1) + da * (io_flag ^ 0x0); \
> + parm[1] = sa * (io_flag ^ 0x0) + da * (io_flag ^ 0x1); \
> + return combine_ ##cd## joint_ ##io## _part (parm[0], parm[1]); \
> + }
> +/* Macro that defines the array of function pointers from which the right handlers are picked at function entry */
> +#define DEF_COMB_FUNC_ARR(cd,SUFFIX,suffix) \
> + COMBINE_JOINT_FUNC_##SUFFIX combine_ ##cd## joint_ ##suffix[4] ={ \
> + combine_joint_zero_ ##suffix, \
> + combine_ ##cd## joint_out_part_ ##suffix, \
> + combine_ ##cd## joint_in_part_ ##suffix, \
> + combine_joint_mask_ ##suffix \
> + };
> +
> +typedef uint8_t (*COMBINE_JOINT_FUNC_U)(uint8_t a, uint8_t b, uint8_t io_flag);
> +
> +DEF_FUNC_ZERO_MASK(uint8_t,zero,u, 0x0)
> +DEF_FUNC_ZERO_MASK(uint8_t,mask,u, ~0x0)
> +
> +DEF_FUNC_COMBINE_JOINT_U(dis, in);
> +DEF_FUNC_COMBINE_JOINT_U(dis, out);
> +DEF_COMB_FUNC_ARR(dis,U,u)
> +
> +DEF_FUNC_COMBINE_JOINT_U(con, in);
> +DEF_FUNC_COMBINE_JOINT_U(con, out);
> +DEF_COMB_FUNC_ARR(con, U, u)
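To make sure I'm reading the token pasting right, DEF_FUNC_COMBINE_JOINT_U(dis, in) should
expand to roughly the following (my own sketch, not part of the patch):

    static uint8_t inline
    combine_disjoint_in_part_u (uint8_t sa, uint8_t da, uint8_t io_flag)
    {
        uint8_t parm[2];

        /* io_flag == 0 keeps the (sa, da) argument order; io_flag == 1 swaps it */
        parm[0] = sa * (io_flag ^ 0x1) + da * (io_flag ^ 0x0);
        parm[1] = sa * (io_flag ^ 0x0) + da * (io_flag ^ 0x1);
        return combine_disjoint_in_part (parm[0], parm[1]);
    }

i.e. the multiplications by io_flag are a branchless way of choosing the argument order.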
> +/* Common worker behind the 'conjoint' and 'disjoint' combiners. */
> +static void
> +mmx_combine_joint_general_u (uint32_t * dest,
> + const uint32_t *src,
> + const uint32_t *mask,
> + int width,
> + uint8_t comb,
> + COMBINE_JOINT_FUNC_U *cjf)
> +{
> + COMBINE_JOINT_FUNC_U combine_joint_u[2];
> + combine_joint_u[0] = cjf[comb & COMBINE_A]; /* in_part */
> + combine_joint_u[1] = cjf[(comb & COMBINE_B)>>2]; /* out_part */
> +
> + uint32_t *end = dest + width;
> + while (dest < end)
> + {
> + __m64 s64 = combine (src, mask);
> + __m64 d64,sa64,da64;
> + uint8_t sa, da;
> + uint32_t tmp;
> + uint64_t Fa, Fb;
> +
> + /* Because these functions involve division, multimedia instructions
> + * are not used to optimize them.
> + */
> + store8888(&tmp, s64);
> + sa = tmp >> A_SHIFT;
> + da = *dest >> A_SHIFT;
> +
> + Fa = combine_joint_u[0](sa, da, 0);
> + Fb = combine_joint_u[1](sa, da, 1);
> +
> + d64 = load8888(dest);
> + sa64 = expand_alpha_rev (*(__m64*)&Fa);
> + da64 = expand_alpha_rev (*(__m64*)&Fb);
> +
> + d64 = pix_add_mul (s64, sa64, d64, da64);
> +
> + store8888 (dest, d64);
> +
> + ++dest;
> + ++src;
> + if (mask)
> + ++mask;
> + }
> +}
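If I follow the indexing: 'comb & COMBINE_A' is a value 0-3 that selects the Fa handler from
the four-entry array (0 = zero, 1 = out_part, 2 = in_part, 3 = mask), and '(comb & COMBINE_B) >> 2'
does the same for Fb. For example, COMBINE_A_OVER (COMBINE_A_OUT | COMBINE_B_OUT | COMBINE_A_IN
= 0x7) gives Fa from combine_joint_mask_u (i.e. 1.0) and Fb from the out_part handler, which
looks like the expected OVER factors.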
> +
> +
> +static void
> +mmx_combine_disjoint_general_u (uint32_t * dest,
> + const uint32_t *src,
> + const uint32_t *mask,
> + int width,
> + uint8_t comb)
> +{
> + mmx_combine_joint_general_u (dest, src, mask, width, comb, combine_disjoint_u);
> +}
> +
> +static void
> +mmx_combine_disjoint_in_u (pixman_implementation_t *imp,
> + pixman_op_t op,
> + uint32_t * dest,
> + const uint32_t * src,
> + const uint32_t * mask,
> + int width)
> +{
> + mmx_combine_disjoint_general_u (dest, src, mask, width, COMBINE_A_IN);
> +}
> +
> +static void
> +mmx_combine_disjoint_in_reverse_u (pixman_implementation_t *imp,
> + pixman_op_t op,
> + uint32_t * dest,
> + const uint32_t * src,
> + const uint32_t * mask,
> + int width)
> +{
> + mmx_combine_disjoint_general_u (dest, src, mask, width, COMBINE_B_IN);
> +}
> +
> +static void
> +mmx_combine_disjoint_out_u (pixman_implementation_t *imp,
> + pixman_op_t op,
> + uint32_t * dest,
> + const uint32_t * src,
> + const uint32_t * mask,
> + int width)
> +{
> + mmx_combine_disjoint_general_u (dest, src, mask, width, COMBINE_A_OUT);
> +}
> +
> +static void
> +mmx_combine_disjoint_out_reverse_u (pixman_implementation_t *imp,
> + pixman_op_t op,
> + uint32_t * dest,
> + const uint32_t * src,
> + const uint32_t * mask,
> + int width)
> +{
> + mmx_combine_disjoint_general_u (dest, src, mask, width, COMBINE_B_OUT);
> +}
> +
> +static void
> +mmx_combine_disjoint_atop_u (pixman_implementation_t *imp,
> + pixman_op_t op,
> + uint32_t * dest,
> + const uint32_t * src,
> + const uint32_t * mask,
> + int width)
> +{
> + mmx_combine_disjoint_general_u (dest, src, mask, width, COMBINE_A_ATOP);
> +}
> +
> +static void
> +mmx_combine_disjoint_atop_reverse_u (pixman_implementation_t *imp,
> + pixman_op_t op,
> + uint32_t * dest,
> + const uint32_t * src,
> + const uint32_t * mask,
> + int width)
> +{
> + mmx_combine_disjoint_general_u (dest, src, mask, width, COMBINE_B_ATOP);
> +}
> +
> +static void
> +mmx_combine_disjoint_xor_u (pixman_implementation_t *imp,
> + pixman_op_t op,
> + uint32_t * dest,
> + const uint32_t * src,
> + const uint32_t * mask,
> + int width)
> +{
> + mmx_combine_disjoint_general_u (dest, src, mask, width, COMBINE_XOR);
> +}
> +
> +/* Conjoint */
> +static void
> +mmx_combine_conjoint_general_u(uint32_t * dest,
> + const uint32_t *src,
> + const uint32_t *mask,
> + int width,
> + uint8_t comb)
> +{
> + mmx_combine_joint_general_u (dest, src, mask, width, comb, combine_conjoint_u);
> +}
> +
> +static void
> +mmx_combine_conjoint_over_u (pixman_implementation_t *imp,
> + pixman_op_t op,
> + uint32_t * dest,
> + const uint32_t * src,
> + const uint32_t * mask,
> + int width)
> +{
> + mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_A_OVER);
> +}
> +
> +static void
> +mmx_combine_conjoint_over_reverse_u (pixman_implementation_t *imp,
> + pixman_op_t op,
> + uint32_t * dest,
> + const uint32_t * src,
> + const uint32_t * mask,
> + int width)
> +{
> + mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_B_OVER);
> +}
> +
> +static void
> +mmx_combine_conjoint_in_u (pixman_implementation_t *imp,
> + pixman_op_t op,
> + uint32_t * dest,
> + const uint32_t * src,
> + const uint32_t * mask,
> + int width)
> +{
> + mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_A_IN);
> +}
> +
> +static void
> +mmx_combine_conjoint_in_reverse_u (pixman_implementation_t *imp,
> + pixman_op_t op,
> + uint32_t * dest,
> + const uint32_t * src,
> + const uint32_t * mask,
> + int width)
> +{
> + mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_B_IN);
> +}
> +
> +static void
> +mmx_combine_conjoint_out_u (pixman_implementation_t *imp,
> + pixman_op_t op,
> + uint32_t * dest,
> + const uint32_t * src,
> + const uint32_t * mask,
> + int width)
> +{
> + mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_A_OUT);
> +}
> +
> +static void
> +mmx_combine_conjoint_out_reverse_u (pixman_implementation_t *imp,
> + pixman_op_t op,
> + uint32_t * dest,
> + const uint32_t * src,
> + const uint32_t * mask,
> + int width)
> +{
> + mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_B_OUT);
> +}
> +
> +static void
> +mmx_combine_conjoint_atop_u (pixman_implementation_t *imp,
> + pixman_op_t op,
> + uint32_t * dest,
> + const uint32_t * src,
> + const uint32_t * mask,
> + int width)
> +{
> + mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_A_ATOP);
> +}
> +
> +static void
> +mmx_combine_conjoint_atop_reverse_u (pixman_implementation_t *imp,
> + pixman_op_t op,
> + uint32_t * dest,
> + const uint32_t * src,
> + const uint32_t * mask,
> + int width)
> +{
> + mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_B_ATOP);
> +}
> +
> +static void
> +mmx_combine_conjoint_xor_u (pixman_implementation_t *imp,
> + pixman_op_t op,
> + uint32_t * dest,
> + const uint32_t * src,
> + const uint32_t * mask,
> + int width)
> +{
> + mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_XOR);
> +}
>
> +/* Component alpha combiners */
> static void
> mmx_combine_src_ca (pixman_implementation_t *imp,
> pixman_op_t op,
> @@ -1089,6 +1503,410 @@ mmx_combine_src_ca (pixman_implementation_t *imp,
> }
>
> static void
> +mmx_combine_saturate_ca (pixman_implementation_t *imp,
> + pixman_op_t op,
> + uint32_t * dest,
> + const uint32_t * src,
> + const uint32_t * mask,
> + int width)
> +{
> + uint32_t *end = dest + width;
> + while (dest < end)
> + {
> + uint16_t sa, sr, sg, sb;
> + uint32_t sa32, m32;
> + __m64 m64, s64, d64, sa64, da64, cmpf, res;
> +
> + mmx_combine_mask_ca (src, mask, &s64, &m64);
> +
> + d64 = load8888 (dest);
> + da64 = expand_alpha (negate(d64));
> + cmpf = _mm_cmpgt_pi16 (m64, da64);
> + if (cmpf)
> + {
> + store8888 (&m32, m64);
> + sa = (m32 >> (A_SHIFT));
> + sr = (m32 >> (R_SHIFT)) & MASK;
> + sg = (m32 >> (G_SHIFT)) & MASK;
> + sb = m32 & MASK;
> + sa32 = (~(*dest) >> A_SHIFT) & MASK;
> +
> + sa = (sa) ? sa : 0x1;
> + sr = (sr) ? sr : 0x1;
> + sg = (sg) ? sg : 0x1;
> + sb = (sb) ? sb : 0x1;
> +
> + sa32 = ((sa32 << G_SHIFT) / sb & MASK) |
> + ((((sa32 << G_SHIFT) / sg) & MASK) << G_SHIFT) |
> + ((((sa32 << G_SHIFT) / sr) & MASK) << R_SHIFT) |
> + ((((sa32 << G_SHIFT) / sa) & MASK) << A_SHIFT);
> + sa64 = load8888 (&sa32);
> + da64 = MC (4x00ff);
> + res = pix_multiply (s64, sa64);
> + s64 = _mm_or_si64 (_mm_and_si64 (res, cmpf), _mm_and_si64 (s64, negate (cmpf)));
> + res = pix_multiply (d64, da64);
> + d64 = _mm_or_si64 (_mm_and_si64 (res, cmpf), _mm_and_si64 (d64, negate (cmpf)));
> + }
> + res = _mm_adds_pu8 (s64, d64);
> + store8888 (dest, res);
> +
> + ++dest;
> + ++src;
> + if (mask)
> + ++mask;
> + }
> +}
> +
> +#define DEF_FUNC_COMBINE_JOINT_CA(cd, io) \
> + static uint32_t inline combine_ ##cd## joint_ ##io## _part_ca(uint32_t sa, uint32_t da, uint32_t io_flag) \
> + { \
> + uint8_t da8 = da >> A_SHIFT; \
> + uint32_t m, n, o, p, res; \
> + uint8_t i, parm[2][4], shift=0; \
> + for (i=0; i<4; i++) \
> + { \
> + parm[0][i] = (uint8_t)(sa>>shift) * (io_flag ^ 0x1) + da8 * (io_flag ^ 0x0); \
> + parm[1][i] = (uint8_t)(sa>>shift) * (io_flag ^ 0x0) + da8 * (io_flag ^ 0x1); \
> + shift += G_SHIFT; \
> + } \
> + m = (uint32_t)combine_ ##cd## joint_ ##io## _part (parm[0][0], parm[1][0]); \
> + n = (uint32_t)combine_ ##cd## joint_ ##io## _part (parm[0][1], parm[1][1]) << G_SHIFT; \
> + o = (uint32_t)combine_ ##cd## joint_ ##io## _part (parm[0][2], parm[1][2]) << R_SHIFT; \
> + p = (uint32_t)combine_ ##cd## joint_ ##io## _part (parm[0][3], parm[1][3]) << A_SHIFT; \
> + res = m | n | o | p; \
> + return res; \
> + }
> +
> +typedef uint32_t (*COMBINE_JOINT_FUNC_CA)(uint32_t sa, uint32_t da, uint32_t io_flag);
> +
> +DEF_FUNC_ZERO_MASK(uint32_t, zero, ca, 0x0)
> +DEF_FUNC_ZERO_MASK(uint32_t, mask, ca, ~0x0)
> +
> +DEF_FUNC_COMBINE_JOINT_CA(dis, in);
> +DEF_FUNC_COMBINE_JOINT_CA(dis, out);
> +DEF_COMB_FUNC_ARR(dis, CA, ca)
> +
> +DEF_FUNC_COMBINE_JOINT_CA(con, in);
> +DEF_FUNC_COMBINE_JOINT_CA(con, out);
> +DEF_COMB_FUNC_ARR(con, CA, ca)
> +
> +static void
> +mmx_combine_joint_general_ca (uint32_t * dest,
> + const uint32_t *src,
> + const uint32_t *mask,
> + int width,
> + uint8_t comb,
> + COMBINE_JOINT_FUNC_CA *cjf)
> +{
> + COMBINE_JOINT_FUNC_CA combine_joint_ca[2];
> + combine_joint_ca[0] = cjf[comb & COMBINE_A];
> + combine_joint_ca[1] = cjf[(comb & COMBINE_B)>>2];
> +
> + uint32_t *end = dest + width;
> + while (dest < end)
> + {
> + __m64 m64, s64, sa64, da64, d64;
> + uint32_t m32, Fa, Fb;
> +
> + mmx_combine_mask_ca (src, mask, &s64, &m64);
> + store8888(&m32, m64);
> +
> + Fa = combine_joint_ca[0](m32, *dest, 0);
> + Fb = combine_joint_ca[1](m32, *dest, 1);
> +
> + sa64 = load8888 (&Fa);
> + da64 = load8888 (&Fb);
> +
> + d64 = load8888 (dest);
> + d64 = pix_add_mul(s64, sa64, d64, da64);
> +
> + store8888 (dest, d64);
> +
> + ++dest;
> + ++src;
> + if (mask)
> + ++mask;
> + }
> +
> +}
> +
> +static void
> +mmx_combine_disjoint_general_ca (uint32_t * dest,
> + const uint32_t *src,
> + const uint32_t *mask,
> + int width,
> + uint8_t comb)
> +{
> + mmx_combine_joint_general_ca (dest, src, mask, width, comb, combine_disjoint_ca);
> +}
> +
> +static void
> +mmx_combine_disjoint_over_ca (pixman_implementation_t *imp,
> + pixman_op_t op,
> + uint32_t * dest,
> + const uint32_t * src,
> + const uint32_t * mask,
> + int width)
> +{
> + mmx_combine_disjoint_general_ca (dest, src, mask, width, COMBINE_A_OVER);
> +}
> +
> +static void
> +mmx_combine_disjoint_in_ca (pixman_implementation_t *imp,
> + pixman_op_t op,
> + uint32_t * dest,
> + const uint32_t * src,
> + const uint32_t * mask,
> + int width)
> +{
> + mmx_combine_disjoint_general_ca (dest, src, mask, width, COMBINE_A_IN);
> +}
> +
> +static void
> +mmx_combine_disjoint_in_reverse_ca (pixman_implementation_t *imp,
> + pixman_op_t op,
> + uint32_t * dest,
> + const uint32_t * src,
> + const uint32_t * mask,
> + int width)
> +{
> + mmx_combine_disjoint_general_ca (dest, src, mask, width, COMBINE_B_IN);
> +}
> +
> +static void
> +mmx_combine_disjoint_out_ca (pixman_implementation_t *imp,
> + pixman_op_t op,
> + uint32_t * dest,
> + const uint32_t * src,
> + const uint32_t * mask,
> + int width)
> +{
> + mmx_combine_disjoint_general_ca (dest, src, mask, width, COMBINE_A_OUT);
> +}
> +
> +static void
> +mmx_combine_disjoint_out_reverse_ca (pixman_implementation_t *imp,
> + pixman_op_t op,
> + uint32_t * dest,
> + const uint32_t * src,
> + const uint32_t * mask,
> + int width)
> +{
> + mmx_combine_disjoint_general_ca (dest, src, mask, width, COMBINE_B_OUT);
> +}
> +
> +static void
> +mmx_combine_disjoint_atop_ca (pixman_implementation_t *imp,
> + pixman_op_t op,
> + uint32_t * dest,
> + const uint32_t * src,
> + const uint32_t * mask,
> +