[Pixman] [PATCH] mmx: compile on MIPS for Loongson-3A MMI optimizations
xianjudiao at gmail.com
Tue Sep 18 09:33:26 UTC 2018
From: Xianju Diao <xianjudiao at gmail.com>
make check:
With USE_OPENMP enabled, the 'glyph-test' and 'cover-test' tests fail on Loongson-3A3000. Neither test
passes even without the optimized code, so this may be a multi-core synchronization bug in the CPU; I
will continue to debug it. For now the affected call is wrapped in an OpenMP critical section, and both
'glyph-test' and 'cover-test' pass.
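
For reference, the workaround simply serializes the per-iteration CRC computation in fuzzer_test_main
(this is the test/utils.c hunk at the end of this patch, reproduced here for context):

    #ifdef USE_LOONGSON_MMI
        uint32_t crc;
        #pragma omp critical
        {
            crc = call_test_function (test_function, i, 0);
        }
    #else
        uint32_t crc = call_test_function (test_function, i, 0);
    #endif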
benchmark:
Running the cairo-perf-trace benchmarks on Loongson-3A (before -> after):
                              image                    image16
gvim                          5.425  ->   5.069        5.531  ->   5.236
poppler-reseau                2.149  ->   2.130        2.152  ->   2.139
swfdec-giant-steps-full      18.672  ->   8.215       33.167  ->  18.280
swfdec-giant-steps            7.014  ->   2.455       12.480  ->   5.982
xfce4-terminal-a1            13.695  ->   5.241       15.703  ->   5.859
gnome-system-monitor         12.783  ->   7.058       12.780  ->   7.104
grads-heat-map                0.482  ->   0.486        0.516  ->   0.514
firefox-talos-svg           141.138  -> 134.621      152.495  -> 159.069
firefox-talos-gfx            23.119  ->  14.437       24.870  ->  15.161
firefox-world-map            32.018  ->  27.139       33.817  ->  28.085
firefox-periodic-table       12.305  ->  12.443       12.876  ->  12.913
evolution                     7.071  ->   3.564        8.550  ->   3.784
firefox-planet-gnome         77.926  ->  67.526       81.554  ->  65.840
ocitysmap                     4.934  ->   1.702        4.937  ->   1.701
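
One implementation note on the pixman-mmx.c changes below: the new disjoint/conjoint combiners do not
re-test the operator for every pixel. The two Porter-Duff factor functions (Fa for the source, Fb for
the destination) are looked up once from a small table indexed by the 'combine' code, and the pixel
loop then calls them through function pointers. A simplified, scalar sketch of that idea follows
(standalone example; the io_flag argument swapping and the MMI arithmetic of the real code are omitted):

#include <stdint.h>
#include <stdio.h>

#define MASK 0xff
/* 8-bit fixed-point divide with the same rounding as pixman's DIV_UN8 */
#define DIV_UN8(a, b) (((uint16_t) (a) * MASK + ((b) / 2)) / (b))

#define COMBINE_A_OUT 1
#define COMBINE_A_IN  2
#define COMBINE_B_OUT 4
#define COMBINE_B_IN  8
#define COMBINE_A (COMBINE_A_OUT | COMBINE_A_IN)
#define COMBINE_B (COMBINE_B_OUT | COMBINE_B_IN)

typedef uint8_t (*joint_func_t) (uint8_t sa, uint8_t da);

static uint8_t factor_zero (uint8_t sa, uint8_t da) { (void) sa; (void) da; return 0; }
static uint8_t factor_mask (uint8_t sa, uint8_t da) { (void) sa; (void) da; return MASK; }

/* disjoint "out" factor: min (1, (1 - da) / sa), in 8-bit fixed point */
static uint8_t
factor_disjoint_out (uint8_t sa, uint8_t da)
{
    uint8_t d = ~da;
    return d >= sa ? MASK : DIV_UN8 (d, sa);
}

/* disjoint "in" factor: max (1 - (1 - da) / sa, 0) */
static uint8_t
factor_disjoint_in (uint8_t sa, uint8_t da)
{
    uint8_t d = ~da;
    return d >= sa ? 0 : (uint8_t) ~DIV_UN8 (d, sa);
}

int
main (void)
{
    /* table index 0..3 = zero / out / in / mask, as in the patch */
    joint_func_t disjoint[4] = { factor_zero, factor_disjoint_out,
                                 factor_disjoint_in, factor_mask };

    uint8_t comb = COMBINE_A_IN | COMBINE_B_OUT;          /* operator chosen once     */
    joint_func_t fa = disjoint[comb & COMBINE_A];         /* factor applied to source */
    joint_func_t fb = disjoint[(comb & COMBINE_B) >> 2];  /* factor applied to dest   */

    uint8_t sa = 0x80, da = 0x40;                         /* example alpha values     */
    printf ("Fa = %d, Fb = %d\n", fa (sa, da), fb (sa, da));
    return 0;
}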
---
configure.ac | 7 +-
pixman/Makefile.am | 4 +-
pixman/loongson-mmintrin.h | 46 ++
pixman/pixman-combine32.h | 6 +
pixman/pixman-mips-dspr2-asm.h | 2 +-
pixman/pixman-mips-memcpy-asm.S | 324 +++++-------
pixman/pixman-mmx.c | 1088 ++++++++++++++++++++++++++++++++++++++-
pixman/pixman-private.h | 32 +-
pixman/pixman-solid-fill.c | 49 +-
pixman/pixman-utils.c | 65 ++-
test/Makefile.am | 2 +-
test/utils.c | 8 +
12 files changed, 1418 insertions(+), 215 deletions(-)
diff --git a/configure.ac b/configure.ac
index e833e45..3e3dde5 100644
--- a/configure.ac
+++ b/configure.ac
@@ -154,9 +154,9 @@ AC_CHECK_DECL([__amd64], [AMD64_ABI="yes"], [AMD64_ABI="no"])
# has set CFLAGS.
if test $SUNCC = yes && \
test "x$test_CFLAGS" = "x" && \
- test "$CFLAGS" = "-g"
+ test "$CFLAGS" = "-g -mabi=n64"
then
- CFLAGS="-O -g"
+ CFLAGS="-O -g -mabi=n64"
fi
#
@@ -183,6 +183,7 @@ AC_SUBST(LT_VERSION_INFO)
# Check for dependencies
PIXMAN_CHECK_CFLAG([-Wall])
+PIXMAN_CHECK_CFLAG([-mabi=n64])
PIXMAN_CHECK_CFLAG([-Wdeclaration-after-statement])
PIXMAN_CHECK_CFLAG([-Wno-unused-local-typedefs])
PIXMAN_CHECK_CFLAG([-fno-strict-aliasing])
@@ -273,7 +274,7 @@ dnl ===========================================================================
dnl Check for Loongson Multimedia Instructions
if test "x$LS_CFLAGS" = "x" ; then
- LS_CFLAGS="-march=loongson2f"
+ LS_CFLAGS="-march=loongson3a"
fi
have_loongson_mmi=no
diff --git a/pixman/Makefile.am b/pixman/Makefile.am
index 581b6f6..e3a080c 100644
--- a/pixman/Makefile.am
+++ b/pixman/Makefile.am
@@ -122,7 +122,7 @@ libpixman_mips_dspr2_la_SOURCES = \
pixman-mips-dspr2.h \
pixman-mips-dspr2-asm.S \
pixman-mips-dspr2-asm.h \
- pixman-mips-memcpy-asm.S
+ #pixman-mips-memcpy-asm.S
libpixman_1_la_LIBADD += libpixman-mips-dspr2.la
ASM_CFLAGS_mips_dspr2=
@@ -131,7 +131,7 @@ endif
# loongson code
if USE_LOONGSON_MMI
noinst_LTLIBRARIES += libpixman-loongson-mmi.la
-libpixman_loongson_mmi_la_SOURCES = pixman-mmx.c loongson-mmintrin.h
+libpixman_loongson_mmi_la_SOURCES = pixman-mmx.c loongson-mmintrin.h pixman-mips-memcpy-asm.S
libpixman_loongson_mmi_la_CFLAGS = $(LS_CFLAGS)
libpixman_1_la_LDFLAGS += $(LS_LDFLAGS)
libpixman_1_la_LIBADD += libpixman-loongson-mmi.la
diff --git a/pixman/loongson-mmintrin.h b/pixman/loongson-mmintrin.h
index 086c6e0..f049463 100644
--- a/pixman/loongson-mmintrin.h
+++ b/pixman/loongson-mmintrin.h
@@ -89,6 +89,17 @@ _mm_adds_pu8 (__m64 __m1, __m64 __m2)
}
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_andn_si64 (__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+ asm("pandn %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f"(__m2)
+ );
+ return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si64 (__m64 __m1, __m64 __m2)
{
__m64 ret;
@@ -100,6 +111,17 @@ _mm_and_si64 (__m64 __m1, __m64 __m2)
}
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+ asm("pcmpeqh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+ return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
{
__m64 ret;
@@ -110,6 +132,30 @@ _mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
return ret;
}
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+loongson_fand (__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+ asm("fand %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+ return ret;
+}
+
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+ asm("pcmpgth %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+ return ret;
+}
+
+
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty (void)
{
diff --git a/pixman/pixman-combine32.h b/pixman/pixman-combine32.h
index cdd56a6..27f62d9 100644
--- a/pixman/pixman-combine32.h
+++ b/pixman/pixman-combine32.h
@@ -14,6 +14,12 @@
#define RB_ONE_HALF 0x800080
#define RB_MASK_PLUS_ONE 0x10000100
+#define RGB_MASK 0xffffff
+#define RGB_DMASK 0xffffffffffffULL
+#define R_DMASK 0x0000ffff00000000ULL
+#define G_DMASK 0x00000000ffff0000ULL
+#define B_DMASK 0x000000000000ffffULL
+
#define ALPHA_8(x) ((x) >> A_SHIFT)
#define RED_8(x) (((x) >> R_SHIFT) & MASK)
#define GREEN_8(x) (((x) >> G_SHIFT) & MASK)
diff --git a/pixman/pixman-mips-dspr2-asm.h b/pixman/pixman-mips-dspr2-asm.h
index e238566..63d7d96 100644
--- a/pixman/pixman-mips-dspr2-asm.h
+++ b/pixman/pixman-mips-dspr2-asm.h
@@ -77,7 +77,7 @@
.ent symbol, 0; \
symbol: .frame sp, 0, ra; \
.set push; \
- .set arch=mips32r2; \
+ .set arch=mips64r2; \
.set noreorder; \
.set noat;
diff --git a/pixman/pixman-mips-memcpy-asm.S b/pixman/pixman-mips-memcpy-asm.S
index 9ad6da5..a140191 100644
--- a/pixman/pixman-mips-memcpy-asm.S
+++ b/pixman/pixman-mips-memcpy-asm.S
@@ -54,19 +54,20 @@ LEAF_MIPS32R2(pixman_mips_fast_memcpy)
/* Test if the src and dst are word-aligned, or can be made word-aligned */
xor t8, a1, a0
- andi t8, t8, 0x3 /* t8 is a0/a1 word-displacement */
+ andi t8, t8, 0x7 /* t8 is a0/a1 word-displacement */
bne t8, zero, $unaligned
negu a3, a0
- andi a3, a3, 0x3 /* we need to copy a3 bytes to make a0/a1 aligned */
+ andi a3, a3, 0x7 /* we need to copy a3 bytes to make a0/a1 aligned */
beq a3, zero, $chk16w /* when a3=0 then the dst (a0) is word-aligned */
subu a2, a2, a3 /* now a2 is the remining bytes count */
- LWHI t8, 0(a1)
- addu a1, a1, a3
- SWHI t8, 0(a0)
- addu a0, a0, a3
+ ld t8, 0(a1)
+ daddu a1, a1, a3
+ sdl t8, 7(a0)
+ sdr t8, 0(a0)
+ daddu a0, a0, a3
/* Now the dst/src are mutually word-aligned with word-aligned addresses */
$chk16w: andi t8, a2, 0x3f /* any whole 64-byte chunks? */
@@ -76,9 +77,9 @@ $chk16w: andi t8, a2, 0x3f /* any whole 64-byte chunks? */
/* There will be at most 1 32-byte chunk after it */
subu a3, a2, t8 /* subtract from a2 the reminder */
/* Here a3 counts bytes in 16w chunks */
- addu a3, a0, a3 /* Now a3 is the final dst after 64-byte chunks */
+ daddu a3, a0, a3 /* Now a3 is the final dst after 64-byte chunks */
- addu t0, a0, a2 /* t0 is the "past the end" address */
+ daddu t0, a0, a2 /* t0 is the "past the end" address */
/*
* When in the loop we exercise "pref 30, x(a0)", the a0+x should not be past
@@ -89,119 +90,98 @@ $chk16w: andi t8, a2, 0x3f /* any whole 64-byte chunks? */
*/
subu t9, t0, 160 /* t9 is the "last safe pref 30, 128(a0)" address */
- pref 0, 0(a1) /* bring the first line of src, addr 0 */
- pref 0, 32(a1) /* bring the second line of src, addr 32 */
- pref 0, 64(a1) /* bring the third line of src, addr 64 */
- pref 30, 32(a0) /* safe, as we have at least 64 bytes ahead */
+ lw $0, 0(a1) /* bring the first line of src, addr 0 */
+ lw $0, 32(a1) /* bring the second line of src, addr 32 */
+ lw $0, 64(a1) /* bring the third line of src, addr 64 */
+ lw $0, 32(a0) /* safe, as we have at least 64 bytes ahead */
/* In case the a0 > t9 don't use "pref 30" at all */
sgtu v1, a0, t9
bgtz v1, $loop16w /* skip "pref 30, 64(a0)" for too short arrays */
nop
/* otherwise, start with using pref30 */
- pref 30, 64(a0)
+ lw $0, 64(a0)
$loop16w:
- pref 0, 96(a1)
- lw t0, 0(a1)
+ lw $0, 96(a1)
+ ld t0, 0(a1)
bgtz v1, $skip_pref30_96 /* skip "pref 30, 96(a0)" */
- lw t1, 4(a1)
- pref 30, 96(a0) /* continue setting up the dest, addr 96 */
+ lw $0, 96(a0) /* continue setting up the dest, addr 96 */
$skip_pref30_96:
- lw t2, 8(a1)
- lw t3, 12(a1)
- lw t4, 16(a1)
- lw t5, 20(a1)
- lw t6, 24(a1)
- lw t7, 28(a1)
- pref 0, 128(a1) /* bring the next lines of src, addr 128 */
-
- sw t0, 0(a0)
- sw t1, 4(a0)
- sw t2, 8(a0)
- sw t3, 12(a0)
- sw t4, 16(a0)
- sw t5, 20(a0)
- sw t6, 24(a0)
- sw t7, 28(a0)
-
- lw t0, 32(a1)
+ ld t2, 8(a1)
+ ld t4, 16(a1)
+ ld t6, 24(a1)
+ lw $0, 128(a1) /* bring the next lines of src, addr 128 */
+ lw $0, 0x0(a0)
+
+ sd t0, 0(a0)
+ sd t2, 8(a0)
+ sd t4, 16(a0)
+ sd t6, 24(a0)
+
+ ld t0, 32(a1)
bgtz v1, $skip_pref30_128 /* skip "pref 30, 128(a0)" */
- lw t1, 36(a1)
- pref 30, 128(a0) /* continue setting up the dest, addr 128 */
+ lw $0, 128(a0) /* continue setting up the dest, addr 128 */
$skip_pref30_128:
- lw t2, 40(a1)
- lw t3, 44(a1)
- lw t4, 48(a1)
- lw t5, 52(a1)
- lw t6, 56(a1)
- lw t7, 60(a1)
- pref 0, 160(a1) /* bring the next lines of src, addr 160 */
-
- sw t0, 32(a0)
- sw t1, 36(a0)
- sw t2, 40(a0)
- sw t3, 44(a0)
- sw t4, 48(a0)
- sw t5, 52(a0)
- sw t6, 56(a0)
- sw t7, 60(a0)
-
- addiu a0, a0, 64 /* adding 64 to dest */
+ ld t2, 40(a1)
+ ld t4, 48(a1)
+ ld t6, 56(a1)
+ lw $0, 160(a1) /* bring the next lines of src, addr 160 */
+ lw $0, 0x32(a0)
+
+ sd t0, 32(a0)
+ sd t2, 40(a0)
+ sd t4, 48(a0)
+ sd t6, 56(a0)
+
+ daddiu a0, a0, 64 /* adding 64 to dest */
sgtu v1, a0, t9
bne a0, a3, $loop16w
- addiu a1, a1, 64 /* adding 64 to src */
+ daddiu a1, a1, 64 /* adding 64 to src */
move a2, t8
/* Here we have src and dest word-aligned but less than 64-bytes to go */
$chk8w:
- pref 0, 0x0(a1)
+ lw $0, 0x0(a1)
andi t8, a2, 0x1f /* is there a 32-byte chunk? */
/* the t8 is the reminder count past 32-bytes */
beq a2, t8, $chk1w /* when a2=t8, no 32-byte chunk */
nop
- lw t0, 0(a1)
- lw t1, 4(a1)
- lw t2, 8(a1)
- lw t3, 12(a1)
- lw t4, 16(a1)
- lw t5, 20(a1)
- lw t6, 24(a1)
- lw t7, 28(a1)
- addiu a1, a1, 32
-
- sw t0, 0(a0)
- sw t1, 4(a0)
- sw t2, 8(a0)
- sw t3, 12(a0)
- sw t4, 16(a0)
- sw t5, 20(a0)
- sw t6, 24(a0)
- sw t7, 28(a0)
- addiu a0, a0, 32
+ ld t0, 0(a1)
+ ld t2, 8(a1)
+ ld t4, 16(a1)
+ ld t6, 24(a1)
+ lw $0, 0x0(a0)
+ daddiu a1, a1, 32
+
+ sd t0, 0(a0)
+ sd t2, 8(a0)
+ sd t4, 16(a0)
+ sd t6, 24(a0)
+ daddiu a0, a0, 32
$chk1w:
andi a2, t8, 0x3 /* now a2 is the reminder past 1w chunks */
beq a2, t8, $last8
subu a3, t8, a2 /* a3 is count of bytes in 1w chunks */
- addu a3, a0, a3 /* now a3 is the dst address past the 1w chunks */
+ daddu a3, a0, a3 /* now a3 is the dst address past the 1w chunks */
/* copying in words (4-byte chunks) */
$wordCopy_loop:
lw t3, 0(a1) /* the first t3 may be equal t0 ... optimize? */
- addiu a1, a1, 4
- addiu a0, a0, 4
+ daddiu a1, a1, 4
+ daddiu a0, a0, 4
bne a0, a3, $wordCopy_loop
sw t3, -4(a0)
/* For the last (<8) bytes */
$last8:
blez a2, leave
- addu a3, a0, a2 /* a3 is the last dst address */
+ daddu a3, a0, a2 /* a3 is the last dst address */
$last8loop:
lb v1, 0(a1)
- addiu a1, a1, 1
- addiu a0, a0, 1
+ daddiu a1, a1, 1
+ daddiu a0, a0, 1
bne a0, a3, $last8loop
sb v1, -1(a0)
@@ -214,15 +194,16 @@ leave: j ra
$unaligned:
/* got here with a3="negu a0" */
- andi a3, a3, 0x3 /* test if the a0 is word aligned */
+ andi a3, a3, 0x7 /* test if the a0 is word aligned */
beqz a3, $ua_chk16w
subu a2, a2, a3 /* bytes left after initial a3 bytes */
- LWHI v1, 0(a1)
- LWLO v1, 3(a1)
- addu a1, a1, a3 /* a3 may be here 1, 2 or 3 */
- SWHI v1, 0(a0)
- addu a0, a0, a3 /* below the dst will be word aligned (NOTE1) */
+ ldl v1, 7(a1)
+ ldr v1, 0(a1)
+ daddu a1, a1, a3 /* a3 may be here 1, 2 or 3 */
+ sdl v1, 7(a0)
+ sdr v1, 0(a0)
+ daddu a0, a0, a3 /* below the dst will be word aligned (NOTE1) */
$ua_chk16w: andi t8, a2, 0x3f /* any whole 64-byte chunks? */
/* t8 is the byte count after 64-byte chunks */
@@ -230,149 +211,116 @@ $ua_chk16w: andi t8, a2, 0x3f /* any whole 64-byte chunks? */
/* There will be at most 1 32-byte chunk after it */
subu a3, a2, t8 /* subtract from a2 the reminder */
/* Here a3 counts bytes in 16w chunks */
- addu a3, a0, a3 /* Now a3 is the final dst after 64-byte chunks */
+ daddu a3, a0, a3 /* Now a3 is the final dst after 64-byte chunks */
- addu t0, a0, a2 /* t0 is the "past the end" address */
+ daddu t0, a0, a2 /* t0 is the "past the end" address */
subu t9, t0, 160 /* t9 is the "last safe pref 30, 128(a0)" address */
- pref 0, 0(a1) /* bring the first line of src, addr 0 */
- pref 0, 32(a1) /* bring the second line of src, addr 32 */
- pref 0, 64(a1) /* bring the third line of src, addr 64 */
- pref 30, 32(a0) /* safe, as we have at least 64 bytes ahead */
+ lw $0, 0(a1) /* bring the first line of src, addr 0 */
+ lw $0, 32(a1) /* bring the second line of src, addr 32 */
+ lw $0, 64(a1) /* bring the third line of src, addr 64 */
+ lw $0, 32(a0) /* safe, as we have at least 64 bytes ahead */
/* In case the a0 > t9 don't use "pref 30" at all */
sgtu v1, a0, t9
bgtz v1, $ua_loop16w /* skip "pref 30, 64(a0)" for too short arrays */
nop
/* otherwise, start with using pref30 */
- pref 30, 64(a0)
+ lw $0, 64(a0)
$ua_loop16w:
- pref 0, 96(a1)
- LWHI t0, 0(a1)
- LWLO t0, 3(a1)
- LWHI t1, 4(a1)
+ lw $0, 96(a1)
+ ldl t0, 7(a1)
+ ldr t0, 0(a1)
bgtz v1, $ua_skip_pref30_96
- LWLO t1, 7(a1)
- pref 30, 96(a0) /* continue setting up the dest, addr 96 */
+ lw $0, 96(a0) /* continue setting up the dest, addr 96 */
$ua_skip_pref30_96:
- LWHI t2, 8(a1)
- LWLO t2, 11(a1)
- LWHI t3, 12(a1)
- LWLO t3, 15(a1)
- LWHI t4, 16(a1)
- LWLO t4, 19(a1)
- LWHI t5, 20(a1)
- LWLO t5, 23(a1)
- LWHI t6, 24(a1)
- LWLO t6, 27(a1)
- LWHI t7, 28(a1)
- LWLO t7, 31(a1)
- pref 0, 128(a1) /* bring the next lines of src, addr 128 */
-
- sw t0, 0(a0)
- sw t1, 4(a0)
- sw t2, 8(a0)
- sw t3, 12(a0)
- sw t4, 16(a0)
- sw t5, 20(a0)
- sw t6, 24(a0)
- sw t7, 28(a0)
-
- LWHI t0, 32(a1)
- LWLO t0, 35(a1)
- LWHI t1, 36(a1)
+ ldl t2, 15(a1)
+ ldr t2, 8(a1)
+ ldl t4, 23(a1)
+ ldr t4, 16(a1)
+ ldl t6, 31(a1)
+ ldr t6, 24(a1)
+ lw $0, 128(a1) /* bring the next lines of src, addr 128 */
+ lw $0, 0(a0)
+
+ sd t0, 0(a0)
+ sd t2, 8(a0)
+ sd t4, 16(a0)
+ sd t6, 24(a0)
+
+ ldl t0, 39(a1)
+ ldr t0, 32(a1)
bgtz v1, $ua_skip_pref30_128
- LWLO t1, 39(a1)
- pref 30, 128(a0) /* continue setting up the dest, addr 128 */
+ lw $0, 128(a0) /* continue setting up the dest, addr 128 */
$ua_skip_pref30_128:
- LWHI t2, 40(a1)
- LWLO t2, 43(a1)
- LWHI t3, 44(a1)
- LWLO t3, 47(a1)
- LWHI t4, 48(a1)
- LWLO t4, 51(a1)
- LWHI t5, 52(a1)
- LWLO t5, 55(a1)
- LWHI t6, 56(a1)
- LWLO t6, 59(a1)
- LWHI t7, 60(a1)
- LWLO t7, 63(a1)
- pref 0, 160(a1) /* bring the next lines of src, addr 160 */
-
- sw t0, 32(a0)
- sw t1, 36(a0)
- sw t2, 40(a0)
- sw t3, 44(a0)
- sw t4, 48(a0)
- sw t5, 52(a0)
- sw t6, 56(a0)
- sw t7, 60(a0)
-
- addiu a0, a0, 64 /* adding 64 to dest */
+ ldl t2, 47(a1)
+ ldr t2, 40(a1)
+ ldl t4, 55(a1)
+ ldr t4, 48(a1)
+ ldl t6, 63(a1)
+ ldr t6, 56(a1)
+ lw $0, 32(a0)
+ lw $0, 160(a1) /* bring the next lines of src, addr 160 */
+
+ sd t0, 32(a0)
+ sd t2, 40(a0)
+ sd t4, 48(a0)
+ sd t6, 56(a0)
+
+ daddiu a0, a0, 64 /* adding 64 to dest */
sgtu v1, a0, t9
bne a0, a3, $ua_loop16w
- addiu a1, a1, 64 /* adding 64 to src */
+ daddiu a1, a1, 64 /* adding 64 to src */
move a2, t8
/* Here we have src and dest word-aligned but less than 64-bytes to go */
$ua_chk8w:
- pref 0, 0x0(a1)
+ lw $0, 0x0(a1)
andi t8, a2, 0x1f /* is there a 32-byte chunk? */
/* the t8 is the reminder count */
beq a2, t8, $ua_chk1w /* when a2=t8, no 32-byte chunk */
- LWHI t0, 0(a1)
- LWLO t0, 3(a1)
- LWHI t1, 4(a1)
- LWLO t1, 7(a1)
- LWHI t2, 8(a1)
- LWLO t2, 11(a1)
- LWHI t3, 12(a1)
- LWLO t3, 15(a1)
- LWHI t4, 16(a1)
- LWLO t4, 19(a1)
- LWHI t5, 20(a1)
- LWLO t5, 23(a1)
- LWHI t6, 24(a1)
- LWLO t6, 27(a1)
- LWHI t7, 28(a1)
- LWLO t7, 31(a1)
- addiu a1, a1, 32
-
- sw t0, 0(a0)
- sw t1, 4(a0)
- sw t2, 8(a0)
- sw t3, 12(a0)
- sw t4, 16(a0)
- sw t5, 20(a0)
- sw t6, 24(a0)
- sw t7, 28(a0)
- addiu a0, a0, 32
+ ldl t0, 7(a1)
+ ldr t0, 0(a1)
+ ldl t2, 15(a1)
+ ldr t2, 8(a1)
+ ldl t4, 23(a1)
+ ldr t4, 16(a1)
+ ldl t6, 31(a1)
+ ldr t6, 24(a1)
+ lw $0, 0x0(a0)
+ daddiu a1, a1, 32
+
+ sd t0, 0(a0)
+ sd t2, 8(a0)
+ sd t4, 16(a0)
+ sd t6, 24(a0)
+ daddiu a0, a0, 32
$ua_chk1w:
andi a2, t8, 0x3 /* now a2 is the reminder past 1w chunks */
beq a2, t8, $ua_smallCopy
subu a3, t8, a2 /* a3 is count of bytes in 1w chunks */
- addu a3, a0, a3 /* now a3 is the dst address past the 1w chunks */
+ daddu a3, a0, a3 /* now a3 is the dst address past the 1w chunks */
/* copying in words (4-byte chunks) */
$ua_wordCopy_loop:
LWHI v1, 0(a1)
LWLO v1, 3(a1)
- addiu a1, a1, 4
- addiu a0, a0, 4 /* note: dst=a0 is word aligned here, see NOTE1 */
+ daddiu a1, a1, 4
+ daddiu a0, a0, 4 /* note: dst=a0 is word aligned here, see NOTE1 */
bne a0, a3, $ua_wordCopy_loop
sw v1, -4(a0)
/* Now less than 4 bytes (value in a2) left to copy */
$ua_smallCopy:
beqz a2, leave
- addu a3, a0, a2 /* a3 is the last dst address */
+ daddu a3, a0, a2 /* a3 is the last dst address */
$ua_smallCopy_loop:
lb v1, 0(a1)
- addiu a1, a1, 1
- addiu a0, a0, 1
+ daddiu a1, a1, 1
+ daddiu a0, a0, 1
bne a0, a3, $ua_smallCopy_loop
sb v1, -1(a0)
diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index dec3974..edbf16b 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -59,6 +59,71 @@ _mm_empty (void)
}
#endif
+#define COMBINE_A_OUT 1
+#define COMBINE_A_IN 2
+#define COMBINE_B_OUT 4
+#define COMBINE_B_IN 8
+
+#define COMBINE_CLEAR 0
+#define COMBINE_A (COMBINE_A_OUT | COMBINE_A_IN)
+#define COMBINE_B (COMBINE_B_OUT | COMBINE_B_IN)
+#define COMBINE_A_OVER (COMBINE_A_OUT | COMBINE_B_OUT | COMBINE_A_IN)
+#define COMBINE_B_OVER (COMBINE_A_OUT | COMBINE_B_OUT | COMBINE_B_IN)
+#define COMBINE_A_ATOP (COMBINE_B_OUT | COMBINE_A_IN)
+#define COMBINE_B_ATOP (COMBINE_A_OUT | COMBINE_B_IN)
+#define COMBINE_XOR (COMBINE_A_OUT | COMBINE_B_OUT)
+
+/* no SIMD instructions for div, so leave it alone
+ * portion covered by a but not b
+ * min (1, (1-b) / a)
+ */
+static uint8_t
+combine_disjoint_out_part (uint8_t a, uint8_t b)
+{
+
+ b = ~b;
+ if (b >= a)
+ return MASK;
+ return DIV_UN8 (b, a);
+}
+
+/* portion covered by both a and b
+ * max (1-(1-b)/a, 0)
+ */
+static uint8_t
+combine_disjoint_in_part (uint8_t a, uint8_t b)
+{
+
+ b = ~b;
+ if (b >= a)
+ return 0;
+ return ~DIV_UN8(b, a);
+}
+
+/* portion covered by a but not b
+ * max (1 - b/a, 0)
+ */
+static uint8_t
+combine_conjoint_out_part (uint8_t a, uint8_t b)
+{
+
+ if (b >= a)
+ return 0x00;
+ return ~DIV_UN8(b, a);
+}
+
+/* portion covered by both a and b
+ * min (1, b/a)
+ */
+static uint8_t
+combine_conjoint_in_part (uint8_t a, uint8_t b)
+{
+
+ if (b >= a)
+ return MASK;
+ return DIV_UN8 (b, a);
+}
+
#ifdef USE_X86_MMX
# if (defined(__SUNPRO_C) || defined(_MSC_VER) || defined(_WIN64))
# include <xmmintrin.h>
@@ -78,7 +143,8 @@ _mm_movemask_pi8 (__m64 __A)
return ret;
}
-
+#define __OPTIMIZE__
+#ifdef __OPTIMIZE__
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pu16 (__m64 __A, __m64 __B)
{
@@ -88,7 +154,7 @@ _mm_mulhi_pu16 (__m64 __A, __m64 __B)
);
return __A;
}
-
+#else
# define _mm_shuffle_pi16(A, N) \
({ \
__m64 ret; \
@@ -102,7 +168,7 @@ _mm_mulhi_pu16 (__m64 __A, __m64 __B)
})
# endif
#endif
-
+#endif
#ifndef _MSC_VER
#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
(((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
@@ -710,6 +776,34 @@ combine (const uint32_t *src, const uint32_t *mask)
return vsrc;
}
+static force_inline void
+mmx_combine_mask_ca(const uint32_t *src, const uint32_t *mask, __m64 *s64, __m64 *m64)
+{
+ __m64 res, tmp;
+
+ if(!(*mask))
+ {
+ *s64 = 0;
+ *m64 = 0;
+ return;
+ }
+
+ *s64 = load8888(src);
+
+ if (*mask == ~0)
+ {
+ *m64 = expand_alpha(*s64);
+ return;
+ }
+
+ *m64 = load8888(mask);
+
+ res = pix_multiply(*s64, *m64);
+ tmp = expand_alpha(*s64);
+ *s64 = res;
+ *m64 = pix_multiply(*m64, tmp);
+}
+
static force_inline __m64
core_combine_over_u_pixel_mmx (__m64 vsrc, __m64 vdst)
{
@@ -729,6 +823,39 @@ core_combine_over_u_pixel_mmx (__m64 vsrc, __m64 vdst)
}
static void
+mmx_combine_disjoint_over_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ uint32_t *end = dest + width;
+ uint32_t s32;
+ uint64_t sa64;
+ __m64 s64, d64;
+
+ while (dest < end)
+ {
+ s64 = combine (src, mask);
+
+ if (s64)
+ {
+ store8888(&s32, s64);
+ sa64 = combine_disjoint_out_part (*dest >> A_SHIFT, s32 >> A_SHIFT);
+ d64 = pix_add (pix_multiply (load8888 (dest),expand_alpha_rev ((*(__m64*)&sa64))), s64);
+ store8888 (dest, d64);
+ }
+
+ ++dest;
+ ++src;
+ if (mask)
+ ++mask;
+
+ }
+}
+
+static void
mmx_combine_over_u (pixman_implementation_t *imp,
pixman_op_t op,
uint32_t * dest,
@@ -1062,7 +1189,294 @@ mmx_combine_saturate_u (pixman_implementation_t *imp,
}
_mm_empty ();
}
+/* In functions such as 'combine_conjoint_general_u' there are multiple branches, determined by the parameter 'combine'.
+ * This value does not change while the function runs, so there is no need to test it for every pixel as the original
+ * code does: it is evaluated once at function entry to select the corresponding function pointer, which is then called
+ * directly. */
+#define DEF_FUNC_ZERO_MASK(type, zm, suffix, res) \
+ static type inline combine_joint_ ##zm## _ ##suffix( type sa, type da, type io_flag) \
+ { \
+ return res; \
+ }
+
+/* 'conjoint' has the same code structure as 'disjoint'; only the function name differs, so this macro generates the
+ * corresponding function. The parameter order is determined by 'io_flag': '0' selects 'in_part' and '1' 'out_part'.
+ */
+#define DEF_FUNC_COMBINE_JOINT_U(cd, io) \
+ static uint8_t inline combine_ ##cd## joint_ ##io## _part_u(uint8_t sa, uint8_t da, uint8_t io_flag) \
+ { \
+ uint8_t parm[2]; \
+ parm[0] = sa * (io_flag ^ 0x1) + da * (io_flag ^ 0x0); \
+ parm[1] = sa * (io_flag ^ 0x0) + da * (io_flag ^ 0x1); \
+ return combine_ ##cd## joint_ ##io## _part (parm[0], parm[1]); \
+ }
+/* Macro defining the array of function pointers; the correct handler is picked at function entry */
+#define DEF_COMB_FUNC_ARR(cd,SUFFIX,suffix) \
+ COMBINE_JOINT_FUNC_##SUFFIX combine_ ##cd## joint_ ##suffix[4] ={ \
+ combine_joint_zero_ ##suffix, \
+ combine_ ##cd## joint_out_part_ ##suffix, \
+ combine_ ##cd## joint_in_part_ ##suffix, \
+ combine_joint_mask_ ##suffix \
+ };
+
+typedef uint8_t (*COMBINE_JOINT_FUNC_U)(uint8_t a, uint8_t b, uint8_t io_flag);
+
+DEF_FUNC_ZERO_MASK(uint8_t,zero,u, 0x0)
+DEF_FUNC_ZERO_MASK(uint8_t,mask,u, ~0x0)
+
+DEF_FUNC_COMBINE_JOINT_U(dis, in);
+DEF_FUNC_COMBINE_JOINT_U(dis, out);
+DEF_COMB_FUNC_ARR(dis,U,u)
+
+DEF_FUNC_COMBINE_JOINT_U(con, in);
+DEF_FUNC_COMBINE_JOINT_U(con, out);
+DEF_COMB_FUNC_ARR(con, U, u)
+/* Common worker through which the 'conjoint' and 'disjoint' combiners are implemented. */
+static void
+mmx_combine_joint_general_u (uint32_t * dest,
+ const uint32_t *src,
+ const uint32_t *mask,
+ int width,
+ uint8_t comb,
+ COMBINE_JOINT_FUNC_U *cjf)
+{
+ COMBINE_JOINT_FUNC_U combine_joint_u[2];
+ combine_joint_u[0] = cjf[comb & COMBINE_A]; /* in_part */
+    combine_joint_u[1] = cjf[(comb & COMBINE_B)>>2]; /* out_part */
+
+ uint32_t *end = dest + width;
+ while (dest < end)
+ {
+ __m64 s64 = combine (src, mask);
+ __m64 d64,sa64,da64;
+ uint8_t sa, da;
+ uint32_t tmp;
+ uint64_t Fa, Fb;
+
+      /* Because these functions contain division instructions,
+       * multimedia instructions are not used to optimize them.
+       */
+ store8888(&tmp, s64);
+ sa = tmp >> A_SHIFT;
+ da = *dest >> A_SHIFT;
+
+ Fa = combine_joint_u[0](sa, da, 0);
+ Fb = combine_joint_u[1](sa, da, 1);
+
+ d64 = load8888(dest);
+ sa64 = expand_alpha_rev (*(__m64*)&Fa);
+ da64 = expand_alpha_rev (*(__m64*)&Fb);
+
+ d64 = pix_add_mul (s64, sa64, d64, da64);
+
+ store8888 (dest, d64);
+
+ ++dest;
+ ++src;
+ if (mask)
+ ++mask;
+ }
+}
+
+
+static void
+mmx_combine_disjoint_general_u (uint32_t * dest,
+ const uint32_t *src,
+ const uint32_t *mask,
+ int width,
+ uint8_t comb)
+{
+ mmx_combine_joint_general_u (dest, src, mask, width, comb, combine_disjoint_u);
+}
+
+static void
+mmx_combine_disjoint_in_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ mmx_combine_disjoint_general_u (dest, src, mask, width, COMBINE_A_IN);
+}
+
+static void
+mmx_combine_disjoint_in_reverse_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ mmx_combine_disjoint_general_u (dest, src, mask, width, COMBINE_B_IN);
+}
+
+static void
+mmx_combine_disjoint_out_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ mmx_combine_disjoint_general_u (dest, src, mask, width, COMBINE_A_OUT);
+}
+
+static void
+mmx_combine_disjoint_out_reverse_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ mmx_combine_disjoint_general_u (dest, src, mask, width, COMBINE_B_OUT);
+}
+
+static void
+mmx_combine_disjoint_atop_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ mmx_combine_disjoint_general_u (dest, src, mask, width, COMBINE_A_ATOP);
+}
+
+static void
+mmx_combine_disjoint_atop_reverse_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ mmx_combine_disjoint_general_u (dest, src, mask, width, COMBINE_B_ATOP);
+}
+
+static void
+mmx_combine_disjoint_xor_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ mmx_combine_disjoint_general_u (dest, src, mask, width, COMBINE_XOR);
+}
+
+/* Conjoint */
+static void
+mmx_combine_conjoint_general_u(uint32_t * dest,
+ const uint32_t *src,
+ const uint32_t *mask,
+ int width,
+ uint8_t comb)
+{
+ mmx_combine_joint_general_u (dest, src, mask, width, comb, combine_conjoint_u);
+}
+
+static void
+mmx_combine_conjoint_over_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_A_OVER);
+}
+
+static void
+mmx_combine_conjoint_over_reverse_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_B_OVER);
+}
+
+static void
+mmx_combine_conjoint_in_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_A_IN);
+}
+
+static void
+mmx_combine_conjoint_in_reverse_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_B_IN);
+}
+
+static void
+mmx_combine_conjoint_out_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_A_OUT);
+}
+
+static void
+mmx_combine_conjoint_out_reverse_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_B_OUT);
+}
+
+static void
+mmx_combine_conjoint_atop_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_A_ATOP);
+}
+
+static void
+mmx_combine_conjoint_atop_reverse_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_B_ATOP);
+}
+
+static void
+mmx_combine_conjoint_xor_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_XOR);
+}
+/* Component alpha combiners */
static void
mmx_combine_src_ca (pixman_implementation_t *imp,
pixman_op_t op,
@@ -1089,6 +1503,410 @@ mmx_combine_src_ca (pixman_implementation_t *imp,
}
static void
+mmx_combine_saturate_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ uint32_t *end = dest + width;
+ while (dest < end)
+ {
+ uint16_t sa, sr, sg, sb;
+ uint32_t sa32, m32;
+ __m64 m64, s64, d64, sa64, da64, cmpf, res;
+
+ mmx_combine_mask_ca (src, mask, &s64, &m64);
+
+ d64 = load8888 (dest);
+ da64 = expand_alpha (negate(d64));
+ cmpf = _mm_cmpgt_pi16 (m64, da64);
+ if (cmpf)
+ {
+ store8888 (&m32, m64);
+ sa = (m32 >> (A_SHIFT));
+ sr = (m32 >> (R_SHIFT)) & MASK;
+ sg = (m32 >> (G_SHIFT)) & MASK;
+ sb = m32 & MASK;
+ sa32 = (~(*dest) >> A_SHIFT) & MASK;
+
+ sa = (sa) ? sa : 0x1;
+ sr = (sr) ? sr : 0x1;
+ sg = (sg) ? sg : 0x1;
+ sb = (sb) ? sb : 0x1;
+
+ sa32 = ((sa32 << G_SHIFT) / sb & MASK) |
+ ((((sa32 << G_SHIFT) / sg) & MASK) << G_SHIFT) |
+ ((((sa32 << G_SHIFT) / sr) & MASK) << R_SHIFT) |
+ ((((sa32 << G_SHIFT) / sa) & MASK) << A_SHIFT);
+ sa64 = load8888 (&sa32);
+ da64 = MC (4x00ff);
+ res = pix_multiply (s64, sa64);
+ s64 = _mm_or_si64 (_mm_and_si64 (res, cmpf), _mm_and_si64 (s64, negate (cmpf)));
+ res = pix_multiply (d64, da64);
+ d64 = _mm_or_si64 (_mm_and_si64 (res, cmpf), _mm_and_si64 (d64, negate (cmpf)));
+ }
+ res = _mm_adds_pu8 (s64, d64);
+ store8888 (dest, res);
+
+ ++dest;
+ ++src;
+ if (mask)
+ ++mask;
+ }
+}
+
+#define DEF_FUNC_COMBINE_JOINT_CA(cd, io) \
+ static uint32_t inline combine_ ##cd## joint_ ##io## _part_ca(uint32_t sa, uint32_t da, uint32_t io_flag) \
+ { \
+ uint8_t da8 = da >> A_SHIFT; \
+ uint32_t m, n, o, p, res; \
+ uint8_t i, parm[2][4], shift=0; \
+ for (i=0; i<4; i++) \
+ { \
+ parm[0][i] = (uint8_t)(sa>>shift) * (io_flag ^ 0x1) + da8 * (io_flag ^ 0x0); \
+ parm[1][i] = (uint8_t)(sa>>shift) * (io_flag ^ 0x0) + da8 * (io_flag ^ 0x1); \
+ shift += G_SHIFT; \
+ } \
+ m = (uint32_t)combine_ ##cd## joint_ ##io## _part (parm[0][0], parm[1][0]); \
+ n = (uint32_t)combine_ ##cd## joint_ ##io## _part (parm[0][1], parm[1][1]) << G_SHIFT; \
+ o = (uint32_t)combine_ ##cd## joint_ ##io## _part (parm[0][2], parm[1][2]) << R_SHIFT; \
+ p = (uint32_t)combine_ ##cd## joint_ ##io## _part (parm[0][3], parm[1][3]) << A_SHIFT; \
+ res = m | n | o | p; \
+ return res; \
+ }
+
+typedef uint32_t (*COMBINE_JOINT_FUNC_CA)(uint32_t sa, uint32_t da, uint32_t io_flag);
+
+DEF_FUNC_ZERO_MASK(uint32_t, zero, ca, 0x0)
+DEF_FUNC_ZERO_MASK(uint32_t, mask, ca, ~0x0)
+
+DEF_FUNC_COMBINE_JOINT_CA(dis, in);
+DEF_FUNC_COMBINE_JOINT_CA(dis, out);
+DEF_COMB_FUNC_ARR(dis, CA, ca)
+
+DEF_FUNC_COMBINE_JOINT_CA(con, in);
+DEF_FUNC_COMBINE_JOINT_CA(con, out);
+DEF_COMB_FUNC_ARR(con, CA, ca)
+
+static void
+mmx_combine_joint_general_ca (uint32_t * dest,
+ const uint32_t *src,
+ const uint32_t *mask,
+ int width,
+ uint8_t comb,
+ COMBINE_JOINT_FUNC_CA *cjf)
+{
+ COMBINE_JOINT_FUNC_CA combine_joint_ca[2];
+ combine_joint_ca[0] = cjf[comb & COMBINE_A];
+ combine_joint_ca[1] = cjf[(comb & COMBINE_B)>>2];
+
+ uint32_t *end = dest + width;
+ while (dest < end)
+ {
+ __m64 m64, s64, sa64, da64, d64;
+ uint32_t m32, Fa, Fb;
+
+ mmx_combine_mask_ca (src, mask, &s64, &m64);
+ store8888(&m32, m64);
+
+ Fa = combine_joint_ca[0](m32, *dest, 0);
+ Fb = combine_joint_ca[1](m32, *dest, 1);
+
+ sa64 = load8888 (&Fa);
+ da64 = load8888 (&Fb);
+
+ d64 = load8888 (dest);
+ d64 = pix_add_mul(s64, sa64, d64, da64);
+
+ store8888 (dest, d64);
+
+ ++dest;
+ ++src;
+ if (mask)
+ ++mask;
+ }
+
+}
+
+static void
+mmx_combine_disjoint_general_ca (uint32_t * dest,
+ const uint32_t *src,
+ const uint32_t *mask,
+ int width,
+ uint8_t comb)
+{
+ mmx_combine_joint_general_ca (dest, src, mask, width, comb, combine_disjoint_ca);
+}
+
+static void
+mmx_combine_disjoint_over_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ mmx_combine_disjoint_general_ca (dest, src, mask, width, COMBINE_A_OVER);
+}
+
+static void
+mmx_combine_disjoint_in_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ mmx_combine_disjoint_general_ca (dest, src, mask, width, COMBINE_A_IN);
+}
+
+static void
+mmx_combine_disjoint_in_reverse_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ mmx_combine_disjoint_general_ca (dest, src, mask, width, COMBINE_B_IN);
+}
+
+static void
+mmx_combine_disjoint_out_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ mmx_combine_disjoint_general_ca (dest, src, mask, width, COMBINE_A_OUT);
+}
+
+static void
+mmx_combine_disjoint_out_reverse_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ mmx_combine_disjoint_general_ca (dest, src, mask, width, COMBINE_B_OUT);
+}
+
+static void
+mmx_combine_disjoint_atop_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ mmx_combine_disjoint_general_ca (dest, src, mask, width, COMBINE_A_ATOP);
+}
+
+static void
+mmx_combine_disjoint_atop_reverse_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ mmx_combine_disjoint_general_ca (dest, src, mask, width, COMBINE_B_ATOP);
+}
+
+static void
+mmx_combine_disjoint_xor_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ mmx_combine_disjoint_general_ca (dest, src, mask, width, COMBINE_XOR);
+}
+
+static void
+mmx_combine_conjoint_general_ca(uint32_t * dest,
+ const uint32_t *src,
+ const uint32_t *mask,
+ int width,
+ uint8_t comb)
+{
+ mmx_combine_joint_general_ca(dest,src,mask,width,comb,combine_conjoint_ca);
+}
+
+/*
+ * Multiply
+ * B(Dca, ad, Sca, as) = Dca.Sca
+ */
+
+static void
+mmx_combine_multiply_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ const uint32_t *end = dest + width;
+
+ while (dest < end)
+ {
+ __m64 dia, d, sia;
+ __m64 s = combine (src, mask);
+ __m64 ss = s;
+ d = load8888 (dest);
+ sia = negate (expand_alpha (s));
+ dia = negate (expand_alpha (d));
+ ss = pix_add_mul (ss, dia, d, sia);
+ d = pix_multiply (d, s);
+ d = pix_add (d, ss);
+ store8888 (dest, d);
+
+ ++dest;
+ ++src;
+ if (mask)
+ mask++;
+ }
+ _mm_empty ();
+}
+
+static void
+mmx_combine_multiply_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ const uint32_t *end = dest + width;
+
+ while (dest < end)
+ {
+ __m64 a = load8888 (mask);
+ __m64 s = load8888 (src);
+ __m64 d = load8888 (dest);
+ __m64 r = d;
+ __m64 da = negate (expand_alpha (d));
+ __m64 sa = expand_alpha (s);
+ s = pix_multiply (s, a);
+ a = pix_multiply (a, sa);
+ a = negate (a);
+ r = pix_add_mul (r, a, s, da);
+ d = pix_multiply (d, s);
+ r = pix_add (r, d);
+ store8888 (dest, r);
+
+ ++src;
+ ++dest;
+ ++mask;
+ }
+ _mm_empty ();
+}
+
+static void
+mmx_combine_conjoint_over_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ mmx_combine_conjoint_general_ca (dest, src, mask, width, COMBINE_A_OVER);
+}
+
+static void
+mmx_combine_conjoint_over_reverse_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ mmx_combine_conjoint_general_ca (dest, src, mask, width, COMBINE_B_OVER);
+}
+
+static void
+mmx_combine_conjoint_in_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ mmx_combine_conjoint_general_ca (dest, src, mask, width, COMBINE_A_IN);
+}
+
+static void
+mmx_combine_conjoint_in_reverse_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ mmx_combine_conjoint_general_ca (dest, src, mask, width, COMBINE_B_IN);
+}
+
+static void
+mmx_combine_conjoint_out_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ mmx_combine_conjoint_general_ca (dest, src, mask, width, COMBINE_A_OUT);
+}
+
+static void
+mmx_combine_conjoint_out_reverse_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ mmx_combine_conjoint_general_ca (dest, src, mask, width, COMBINE_B_OUT);
+}
+
+static void
+mmx_combine_conjoint_atop_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ mmx_combine_conjoint_general_ca (dest, src, mask, width, COMBINE_A_ATOP);
+}
+
+static void
+mmx_combine_conjoint_atop_reverse_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ mmx_combine_conjoint_general_ca (dest, src, mask, width, COMBINE_B_ATOP);
+}
+
+static void
+mmx_combine_conjoint_xor_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
+{
+ mmx_combine_conjoint_general_ca (dest, src, mask, width, COMBINE_XOR);
+}
+
+static void
mmx_combine_over_ca (pixman_implementation_t *imp,
pixman_op_t op,
uint32_t * dest,
@@ -2089,23 +2907,34 @@ mmx_fill (pixman_implementation_t *imp,
stride = stride * (int) sizeof (uint32_t) / 1;
byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
byte_width = width;
- stride *= 1;
+/* multiplying stride by 1 is a no-op, so it is dropped */
+/* stride *= 1; */
filler = (filler & 0xff) * 0x01010101;
}
else if (bpp == 16)
{
stride = stride * (int) sizeof (uint32_t) / 2;
byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
+#if 0
byte_width = 2 * width;
stride *= 2;
+#else
+ byte_width = width << 1;
+ stride <<= 1;
+#endif
filler = (filler & 0xffff) * 0x00010001;
}
else
{
stride = stride * (int) sizeof (uint32_t) / 4;
byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
+#if 0
byte_width = 4 * width;
stride *= 4;
+#else
+ byte_width = width << 2;
+ stride <<= 2;
+#endif
}
fill = ((uint64_t)filler << 32) | filler;
@@ -3274,9 +4103,15 @@ mmx_blt (pixman_implementation_t *imp,
dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
+#if 0
byte_width = 2 * width;
src_stride *= 2;
dst_stride *= 2;
+#else
+ byte_width = width << 1;
+ src_stride <<= 1;
+ dst_stride <<= 1;
+#endif
}
else if (src_bpp == 32)
{
@@ -3284,9 +4119,15 @@ mmx_blt (pixman_implementation_t *imp,
dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
+#if 0
byte_width = 4 * width;
src_stride *= 4;
dst_stride *= 4;
+#else
+ byte_width = width << 2;
+ src_stride <<= 2;
+ dst_stride <<= 2;
+#endif
}
else
{
@@ -4003,6 +4844,186 @@ static const pixman_iter_info_t mmx_iters[] =
{ PIXMAN_null },
};
+#define MMX_PDF_SEPARABLE_BLEND_MODE(name) \
+static void \
+mmx_combine_ ## name ## _u (pixman_implementation_t *imp, \
+ pixman_op_t op, \
+ uint32_t * dest, \
+ const uint32_t * src, \
+ const uint32_t * mask, \
+ int width) \
+{ \
+ int i; \
+ for (i = 0; i < width; ++i) { \
+ __m64 s = load8888(src + i); \
+ __m64 d = load8888(dest + i); \
+ __m64 da = expand_alpha(d); \
+ \
+ if(mask) \
+ { \
+ __m64 m = load8888(mask + i); \
+ __m64 ma = expand_alpha(m); \
+ s = pix_multiply(s,ma); \
+ } \
+ __m64 sa = expand_alpha(s); \
+ \
+ __m64 isa = negate(sa); \
+ __m64 ida = negate(da); \
+ \
+ uint32_t result,sada,res; \
+ __m64 temp; \
+ store8888(&result,pix_add_mul(d,isa,s,ida)); \
+ store8888(&sada,pix_multiply(sa,da)); \
+ store8888(&res,mmx_blend_ ## name(d,da,s,sa)); \
+ \
+ sada &= A_MASK; \
+ res &= RGB_MASK; \
+ temp = pix_add( pix_add(load8888(&result), load8888(&sada)), \
+ load8888(&res)); \
+ store8888(dest+i, temp); \
+ } \
+} \
+static void \
+mmx_combine_ ## name ## _ca (pixman_implementation_t *imp, \
+ pixman_op_t op, \
+ uint32_t * dest, \
+ const uint32_t * src, \
+ const uint32_t * mask, \
+ int width) \
+ { \
+ int i; \
+ for (i = 0; i < width; ++i) { \
+ __m64 m = load8888(mask + i); \
+ __m64 s = load8888(src + i); \
+ __m64 d = load8888(dest + i); \
+ __m64 sa = expand_alpha(s); \
+ __m64 da = expand_alpha(d); \
+ __m64 ida = negate(da); \
+ \
+ s = pix_multiply(s,m); \
+ m = pix_multiply(m,sa); \
+ __m64 im = negate(m); \
+ __m64 ima = expand_alpha(m); \
+ \
+ uint32_t result,mada,res; \
+ __m64 temp; \
+ store8888(&result,pix_add_mul(d,im,s,ida)); \
+ store8888(&mada,pix_multiply(ima,da)); \
+ store8888(&res,mmx_blend_ ## name(d,da,s,m)); \
+ \
+ mada &= A_MASK; \
+ res &= RGB_MASK; \
+ temp = pix_add( pix_add(load8888(&result), load8888(&mada)), \
+ load8888(&res)); \
+ store8888(dest+i, temp); \
+ } \
+} \
+
+static inline __m64
+_emulate_pminuh(__m64 s, __m64 d)
+{
+ uint64_t tmp_s = to_uint64(s);
+ uint64_t tmp_d = to_uint64(d);
+
+ __m64 res = to_m64(MIN((tmp_s & R_DMASK), (tmp_d & R_DMASK))
+ | MIN((tmp_s & G_DMASK), (tmp_d & G_DMASK))
+ | MIN((tmp_s & B_DMASK), (tmp_d & B_DMASK)));
+
+ return res;
+}
+
+static inline __m64
+_emulate_pmaxuh(__m64 s, __m64 d)
+{
+ uint64_t tmp_s = to_uint64(s);
+ uint64_t tmp_d = to_uint64(d);
+
+ __m64 res = to_m64(MAX((tmp_s & R_DMASK), (tmp_d & R_DMASK))
+ | MAX((tmp_s & G_DMASK), (tmp_d & G_DMASK))
+ | MAX((tmp_s & B_DMASK), (tmp_d & B_DMASK)));
+
+ return res;
+}
+
+#define R_GREATER(a, b) ((a > b) ? 0x0000ffff00000000ULL : 0)
+#define G_GREATER(a, b) ((a > b) ? 0x00000000ffff0000ULL : 0)
+#define B_GREATER(a, b) ((a > b) ? 0x000000000000ffffULL : 0)
+
+static inline __m64
+_emulate_pcmpgtuh(__m64 s, __m64 d)
+{
+ uint64_t tmp_s = to_uint64(s);
+ uint64_t tmp_d = to_uint64(d);
+
+ __m64 res = to_m64(R_GREATER((tmp_s & R_DMASK), (tmp_d & R_DMASK))
+ | G_GREATER((tmp_s & G_DMASK), (tmp_d & G_DMASK))
+ | B_GREATER((tmp_s & B_DMASK), (tmp_d & B_DMASK)));
+
+ return res;
+}
+
+static inline __m64
+_emulate_paddcmpgtuh(__m64 s, __m64 d1, __m64 d2)
+{
+ uint64_t tmp_s = to_uint64(s);
+ uint64_t tmp_d1 = to_uint64(d1);
+ uint64_t tmp_d2 = to_uint64(d2);
+
+ __m64 res = to_m64(R_GREATER((tmp_s & R_DMASK), (tmp_d1 & R_DMASK) + (tmp_d2 & R_DMASK))
+ | G_GREATER((tmp_s & G_DMASK), (tmp_d1 & G_DMASK) + (tmp_d2 & G_DMASK))
+ | B_GREATER((tmp_s & B_DMASK), (tmp_d1 & B_DMASK) + (tmp_d2 & B_DMASK)));
+
+ return res;
+}
+
+
+/*
+ * Darken
+ * B(Dca, Da, Sca, Sa) = min (Sca.Da, Dca.Sa)
+ */
+static inline __m64
+mmx_blend_darken (__m64 dca, __m64 da, __m64 sca, __m64 sa)
+{
+ __m64 res;
+
+ __m64 s = _mm_mullo_pi16(sca,da);
+ __m64 d = _mm_mullo_pi16(dca,sa);
+
+
+ res = _emulate_pminuh(s, d);
+ res = _mm_adds_pu16(res,MC(4x0080));
+ res = _mm_mulhi_pu16(res,MC(4x0101));
+
+ return res;
+}
+
+MMX_PDF_SEPARABLE_BLEND_MODE (darken)
+
+/*
+ * Lighten
+ * B(Dca, Da, Sca, Sa) = max (Sca.Da, Dca.Sa)
+ */
+static inline __m64
+mmx_blend_lighten (__m64 dca, __m64 da, __m64 sca, __m64 sa)
+{
+ __m64 res;
+
+ __m64 s = _mm_mullo_pi16(sca,da);
+ __m64 d = _mm_mullo_pi16(dca,sa);
+
+ res = _emulate_pmaxuh(s, d);
+ res = _mm_adds_pu16(res,MC(4x0080));
+ res = _mm_mulhi_pu16(res,MC(4x0101));
+
+ return res;
+}
+
+MMX_PDF_SEPARABLE_BLEND_MODE (lighten)
+
+
+#undef MMX_PDF_SEPARABLE_BLEND_MODE
+
+
static const pixman_fast_path_t mmx_fast_paths[] =
{
PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, mmx_composite_over_n_8_0565 ),
@@ -4114,8 +5135,37 @@ _pixman_implementation_create_mmx (pixman_implementation_t *fallback)
{
pixman_implementation_t *imp = _pixman_implementation_create (fallback, mmx_fast_paths);
+ /* Unified alpha */
imp->combine_32[PIXMAN_OP_OVER] = mmx_combine_over_u;
imp->combine_32[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_u;
+ /* Disjoint, unified */
+ imp->combine_32[PIXMAN_OP_DISJOINT_OVER] = mmx_combine_disjoint_over_u;
+ imp->combine_32[PIXMAN_OP_DISJOINT_OVER_REVERSE] = mmx_combine_saturate_u;
+ imp->combine_32[PIXMAN_OP_DISJOINT_IN] = mmx_combine_disjoint_in_u;
+ imp->combine_32[PIXMAN_OP_DISJOINT_IN_REVERSE] = mmx_combine_disjoint_in_reverse_u;
+ imp->combine_32[PIXMAN_OP_DISJOINT_OUT] = mmx_combine_disjoint_out_u;
+ imp->combine_32[PIXMAN_OP_DISJOINT_OUT_REVERSE] = mmx_combine_disjoint_out_reverse_u;
+ imp->combine_32[PIXMAN_OP_DISJOINT_ATOP] = mmx_combine_disjoint_atop_u;
+ imp->combine_32[PIXMAN_OP_DISJOINT_ATOP_REVERSE] = mmx_combine_disjoint_atop_reverse_u;
+ imp->combine_32[PIXMAN_OP_DISJOINT_XOR] = mmx_combine_disjoint_xor_u;
+
+ /* Conjoint, unified */
+ imp->combine_32[PIXMAN_OP_CONJOINT_OVER] = mmx_combine_conjoint_over_u;
+ imp->combine_32[PIXMAN_OP_CONJOINT_OVER_REVERSE] = mmx_combine_conjoint_over_reverse_u;
+ imp->combine_32[PIXMAN_OP_CONJOINT_IN] = mmx_combine_conjoint_in_u;
+ imp->combine_32[PIXMAN_OP_CONJOINT_IN_REVERSE] = mmx_combine_conjoint_in_reverse_u;
+ imp->combine_32[PIXMAN_OP_CONJOINT_OUT] = mmx_combine_conjoint_out_u;
+ imp->combine_32[PIXMAN_OP_CONJOINT_OUT_REVERSE] = mmx_combine_conjoint_out_reverse_u;
+ imp->combine_32[PIXMAN_OP_CONJOINT_ATOP] = mmx_combine_conjoint_atop_u;
+ imp->combine_32[PIXMAN_OP_CONJOINT_ATOP_REVERSE] = mmx_combine_conjoint_atop_reverse_u;
+ imp->combine_32[PIXMAN_OP_CONJOINT_XOR] = mmx_combine_conjoint_xor_u;
+
+ /* Multiply, Unified */
+ imp->combine_32[PIXMAN_OP_MULTIPLY] = mmx_combine_multiply_u;
+ imp->combine_32[PIXMAN_OP_DARKEN] = mmx_combine_darken_u;
+ imp->combine_32[PIXMAN_OP_LIGHTEN] = mmx_combine_lighten_u;
+
+ /* Component alpha combiners */
imp->combine_32[PIXMAN_OP_IN] = mmx_combine_in_u;
imp->combine_32[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_u;
imp->combine_32[PIXMAN_OP_OUT] = mmx_combine_out_u;
@@ -4137,7 +5187,35 @@ _pixman_implementation_create_mmx (pixman_implementation_t *fallback)
imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_ca;
imp->combine_32_ca[PIXMAN_OP_XOR] = mmx_combine_xor_ca;
imp->combine_32_ca[PIXMAN_OP_ADD] = mmx_combine_add_ca;
-
+ imp->combine_32_ca[PIXMAN_OP_SATURATE] = mmx_combine_saturate_ca;
+
+ /* Disjoint CA */
+ imp->combine_32_ca[PIXMAN_OP_DISJOINT_OVER] = mmx_combine_disjoint_over_ca;
+ imp->combine_32_ca[PIXMAN_OP_DISJOINT_OVER_REVERSE] = mmx_combine_saturate_ca;
+ imp->combine_32_ca[PIXMAN_OP_DISJOINT_IN] = mmx_combine_disjoint_in_ca;
+ imp->combine_32_ca[PIXMAN_OP_DISJOINT_IN_REVERSE] = mmx_combine_disjoint_in_reverse_ca;
+ imp->combine_32_ca[PIXMAN_OP_DISJOINT_OUT] = mmx_combine_disjoint_out_ca;
+ imp->combine_32_ca[PIXMAN_OP_DISJOINT_OUT_REVERSE] = mmx_combine_disjoint_out_reverse_ca;
+ imp->combine_32_ca[PIXMAN_OP_DISJOINT_ATOP] = mmx_combine_disjoint_atop_ca;
+ imp->combine_32_ca[PIXMAN_OP_DISJOINT_ATOP_REVERSE] = mmx_combine_disjoint_atop_reverse_ca;
+ imp->combine_32_ca[PIXMAN_OP_DISJOINT_XOR] = mmx_combine_disjoint_xor_ca;
+
+ /* Conjoint CA */
+ imp->combine_32_ca[PIXMAN_OP_CONJOINT_OVER] = mmx_combine_conjoint_over_ca;
+ imp->combine_32_ca[PIXMAN_OP_CONJOINT_OVER_REVERSE] = mmx_combine_conjoint_over_reverse_ca;
+ imp->combine_32_ca[PIXMAN_OP_CONJOINT_IN] = mmx_combine_conjoint_in_ca;
+ imp->combine_32_ca[PIXMAN_OP_CONJOINT_IN_REVERSE] = mmx_combine_conjoint_in_reverse_ca;
+ imp->combine_32_ca[PIXMAN_OP_CONJOINT_OUT] = mmx_combine_conjoint_out_ca;
+ imp->combine_32_ca[PIXMAN_OP_CONJOINT_OUT_REVERSE] = mmx_combine_conjoint_out_reverse_ca;
+ imp->combine_32_ca[PIXMAN_OP_CONJOINT_ATOP] = mmx_combine_conjoint_atop_ca;
+ imp->combine_32_ca[PIXMAN_OP_CONJOINT_ATOP_REVERSE] = mmx_combine_conjoint_atop_reverse_ca;
+ imp->combine_32_ca[PIXMAN_OP_CONJOINT_XOR] = mmx_combine_conjoint_xor_ca;
+
+ /* Multiply CA */
+ imp->combine_32_ca[PIXMAN_OP_MULTIPLY] = mmx_combine_multiply_ca;
+ imp->combine_32_ca[PIXMAN_OP_DARKEN] = mmx_combine_darken_ca;
+ imp->combine_32_ca[PIXMAN_OP_LIGHTEN] = mmx_combine_lighten_ca;
+
imp->blt = mmx_blt;
imp->fill = mmx_fill;
diff --git a/pixman/pixman-private.h b/pixman/pixman-private.h
index 73a5414..93660b6 100644
--- a/pixman/pixman-private.h
+++ b/pixman/pixman-private.h
@@ -916,9 +916,39 @@ convert_8888_to_0565 (uint32_t s)
static force_inline uint32_t
convert_0565_to_0888 (uint16_t s)
{
+ uint32_t ret;
+#if USE_LOONGSON_MMI
+ asm(".set noreorder\r\n"
+ "sll $8, %1, 3\r\n"
+ "andi $8, 0xf8\r\n"
+ "sll $6, %1, 5\r\n"
+ "andi $6, 0xfc00\r\n"
+ "sll $4, %1, 8\r\n"
+ "li $2, 0xf80000\r\n"
+ "and $4, $2\r\n"
+ "or $6, $6, $4\r\n"
+ "or $8, $6\r\n"
+ "srl $4, %1, 2\r\n"
+ "andi $4, 0x7\r\n"
+ "srl $6, %1, 1\r\n"
+ "andi $6, 0x300\r\n"
+ "or $6, $6, $4\r\n"
+ "or $8, $6\r\n"
+ "sll $6, %1, 3\r\n"
+ "li $2, 0x70000\r\n"
+ "and $6, $2\r\n"
+ "or %0, $8, $6\r\n"
+ ".set reorder\r\n"
+ : "=r" (ret)
+ : "r" (s)
+ : "$8","$6","$4","$2"
+ );
+#else
return (((((s) << 3) & 0xf8) | (((s) >> 2) & 0x7)) |
((((s) << 5) & 0xfc00) | (((s) >> 1) & 0x300)) |
((((s) << 8) & 0xf80000) | (((s) << 3) & 0x70000)));
+#endif
+ return ret;
}
static force_inline uint32_t
@@ -991,7 +1021,7 @@ unorm_to_unorm (uint32_t val, int from_bits, int to_bits)
{ \
result |= result >> from_bits; \
\
- from_bits *= 2; \
+ from_bits <<= 1; \
} \
} \
while (0)
diff --git a/pixman/pixman-solid-fill.c b/pixman/pixman-solid-fill.c
index 4694ebc..c0ca417 100644
--- a/pixman/pixman-solid-fill.c
+++ b/pixman/pixman-solid-fill.c
@@ -40,12 +40,53 @@ static argb_t
color_to_float (const pixman_color_t *color)
{
argb_t result;
+#ifdef USE_LOONGSON_MMI
+ uint32_t a = color->alpha;
+ uint32_t r = color->red;
+ uint32_t g = color->green;
+ uint32_t b = color->blue;
+ uint32_t m;
+ float tmp;
+ float counta, countr, countg, countb;
+ /*m=((1<<16)-1)*/
+ m=65535;
+ /* tmp=1.f / (float)m;*/
+ float data = 65535.f;
+ asm(".set noreorder\r\n"
+ "recip.s %4,%5\r\n"
- result.a = pixman_unorm_to_float (color->alpha, 16);
- result.r = pixman_unorm_to_float (color->red, 16);
- result.g = pixman_unorm_to_float (color->green, 16);
- result.b = pixman_unorm_to_float (color->blue, 16);
+ "mtc1 %6, $f0\r\n"
+ "cvt.s.w $f2, $f0\r\n"
+ "mul.s %0,$f2,%4\r\n"
+ "mtc1 %7, $f10\r\n"
+ "cvt.s.w $f4, $f10\r\n"
+ "mul.s %1,$f4,%4\r\n"
+
+ "mtc1 %8, $f12\r\n"
+ "cvt.s.w $f6, $f12\r\n"
+ "mul.s %2,$f6,%4\r\n"
+
+ "mtc1 %9, $f14\r\n"
+ "cvt.s.w $f8, $f14\r\n"
+ "mul.s %3,$f8,%4\r\n"
+
+ ".set reorder\r\n"
+ :"=f"(counta),"=f"(countr),"=f"(countg),"=f"(countb),"=f"(tmp)
+ :"f"(data),"r"(a),"r" (r),"r" (g),"r" (b)
+ :"$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14"
+ );
+
+ result.a = counta;
+ result.r = countr;
+ result.g = countg;
+ result.b = countb;
+#else
+ result.a = pixman_unorm_to_float(color->alpha, 16);
+ result.r = pixman_unorm_to_float(color->red, 16);
+ result.g = pixman_unorm_to_float(color->green, 16);
+ result.b = pixman_unorm_to_float(color->blue, 16);
+#endif
return result;
}
diff --git a/pixman/pixman-utils.c b/pixman/pixman-utils.c
index 4a3a835..51f5cd8 100644
--- a/pixman/pixman-utils.c
+++ b/pixman/pixman-utils.c
@@ -80,28 +80,73 @@ pixman_malloc_abc (unsigned int a,
return malloc (a * b * c);
}
+
static force_inline uint16_t
float_to_unorm (float f, int n_bits)
{
uint32_t u;
- if (f > 1.0)
- f = 1.0;
- if (f < 0.0)
- f = 0.0;
-
- u = f * (1 << n_bits);
- u -= (u >> n_bits);
-
+ if (f >= 1.0)
+ {
+ u = 1 << (n_bits);
+ u--;
+ return u;
+ }
+ else if (f <= 0.0)
+ {
+ return 0.0;
+ }
+ else
+ {
+#ifdef USE_LOONGSON_MMI
+ asm(".set noreorder\r\n"
+ "li $8, 0x1\r\n"
+ "sll $8, %2\r\n"
+ "mtc1 $8, $f2\r\n"
+ "cvt.s.w $f0, $f2\r\n"
+ "mul.s $f0, $f0, %1\r\n"
+ "floor.w.s %0, $f0\r\n"
+ ".set reorder\r\n"
+ : "=f" (u)
+ : "f" (f), "r" (n_bits)
+ : "$8","$f0", "$f2"
+ );
+#else
+ u = f * (1 << n_bits);
+ u -= (u >> n_bits);
+#endif
+ }
return u;
}
static force_inline float
unorm_to_float (uint16_t u, int n_bits)
{
+ float result;
+#ifdef USE_LOONGSON_MMI
+ asm(".set noreorder\r\n"
+ "li $8, 0x1\r\n"
+ "sll $8, %2\r\n"
+ "addu $8, -1\r\n"
+ "mtc1 $8, $f8\r\n"
+ "cvt.s.w $f2, $f8\r\n"
+ "and $8,%1\r\n"
+ "mtc1 $8, $f6\r\n"
+ "cvt.s.w $f4, $f6\r\n"
+ "recip.s $f0, $f2\r\n"
+ "mul.s %0,$f0,$f4\r\n"
+
+ ".set reorder\r\n"
+ : "=f" (result)
+ : "r"(u), "r" (n_bits)
+ : "$8","$f0", "$f2","$f4","$f6","$f8"
+ );
+ return result;
+#else
uint32_t m = ((1 << n_bits) - 1);
return (u & m) * (1.f / (float)m);
+#endif
}
/*
@@ -206,8 +251,8 @@ pixman_contract_from_float (uint32_t *dst,
for (i = 0; i < width; ++i)
{
- uint8_t a, r, g, b;
-
+ uint8_t a, r, g, b;
+
a = float_to_unorm (src[i].a, 8);
r = float_to_unorm (src[i].r, 8);
g = float_to_unorm (src[i].g, 8);
diff --git a/test/Makefile.am b/test/Makefile.am
index 88dc36d..43cafb8 100644
--- a/test/Makefile.am
+++ b/test/Makefile.am
@@ -2,7 +2,7 @@ include $(top_srcdir)/test/Makefile.sources
AM_CFLAGS = $(OPENMP_CFLAGS) $(PTHREAD_CFLAGS)
AM_LDFLAGS = $(OPENMP_CFLAGS) $(TESTPROGS_EXTRA_LDFLAGS) $(PTHREAD_LDFLAGS)
-LDADD = libutils.la $(top_builddir)/pixman/libpixman-1.la -lm $(PNG_LIBS) $(PTHREAD_LIBS)
+LDADD = libutils.la $(top_builddir)/pixman/libpixman-1.la $(top_builddir)/pixman/libpixman-loongson-mmi.la -lm $(PNG_LIBS) $(PTHREAD_LIBS)
AM_CPPFLAGS = -I$(top_srcdir)/pixman -I$(top_builddir)/pixman $(PNG_CFLAGS)
libutils_la_SOURCES = $(libutils_sources) $(libutils_headers)
diff --git a/test/utils.c b/test/utils.c
index f8e42a5..73ddb6f 100644
--- a/test/utils.c
+++ b/test/utils.c
@@ -877,7 +877,15 @@ fuzzer_test_main (const char *test_name,
#endif
for (i = n1; i <= n2; i++)
{
+#ifdef USE_LOONGSON_MMI
+ uint32_t crc;
+ #pragma omp critical
+ {
+ crc = call_test_function (test_function, i, 0);
+ }
+#else
uint32_t crc = call_test_function (test_function, i, 0);
+#endif
if (verbose)
printf ("%d: %08X\n", i, crc);
checksum += crc;
--
2.1.0