[PATCH] Added MIPS32R2 and MIPS DSP ASE optimized functions.

Wed Sep 8 17:34:22 PDT 2010

The following functions were implemented for MIPS32R2:
  - pixman_fill32()
  - fast_composite_over_n_8_8888()

The following functions were implemented for MIPS DSP ASE:
  - combine_over_u()
  - fast_composite_over_n_8_8888()

Additionally, MIPS DSP ASE uses the MIPS32R2 pixman_fill32() function.

Use configure commands similar to the ones below to select the target
processor and, correspondingly, the target instruction set:

  - MIPS32R2: configure CFLAGS='-march=24kc -O2'
  - MIPS DSP ASE: configure CFLAGS='-march=24kec -O2'
---
 configure.ac                     |   63 +++++++++++++
 pixman/Makefile.am               |   22 +++++
 pixman/pixman-cpu.c              |   21 ++++
 pixman/pixman-mips-dspase1-asm.S |  189 ++++++++++++++++++++++++++++++++++++++
 pixman/pixman-mips-dspase1.c     |  107 +++++++++++++++++++++
 pixman/pixman-mips32r2-asm.S     |  180 ++++++++++++++++++++++++++++++++++++
 pixman/pixman-mips32r2.c         |  112 ++++++++++++++++++++++
 pixman/pixman-private.h          |   11 ++
 8 files changed, 705 insertions(+), 0 deletions(-)
 create mode 100644 pixman/pixman-mips-dspase1-asm.S
 create mode 100644 pixman/pixman-mips-dspase1.c
 create mode 100644 pixman/pixman-mips32r2-asm.S
 create mode 100644 pixman/pixman-mips32r2.c

diff --git a/configure.ac b/configure.ac
index 5242799..2a7e49a 100644
--- a/configure.ac
+++ b/configure.ac
@@ -565,6 +565,69 @@ fi
 
 AM_CONDITIONAL(USE_GCC_INLINE_ASM, test $have_gcc_inline_asm = yes)
 
+dnl ==========================================================================
+dnl Check if the compiler supports MIPS32R2 instructions
+dnl Probe: try to compile an inline ext instruction with the current flags.
+AC_MSG_CHECKING(whether to use MIPS32R2 instructions)
+AC_COMPILE_IFELSE([[
+void test()
+{
+        asm("ext \$v0,\$a0,8,8");
+}
+]], have_mips32r2=yes, have_mips32r2=no)
+
+AC_ARG_ENABLE(mips32r2,
+   [AC_HELP_STRING([--disable-mips32r2],
+                   [disable MIPS32R2 fast paths])],
+   [enable_mips32r2=$enableval], [enable_mips32r2=auto])
+dnl An explicit --disable-mips32r2 overrides a successful probe.
+if test $enable_mips32r2 = no ; then
+   have_mips32r2=disabled
+fi
+dnl Publish the result as a preprocessor define and an automake conditional.
+if test $have_mips32r2 = yes ; then
+   AC_DEFINE(USE_MIPS32R2, 1, [use MIPS32R2 optimizations])
+fi
+
+AM_CONDITIONAL(USE_MIPS32R2, test $have_mips32r2 = yes)
+dnl Report yes, no or disabled; --enable-mips32r2 with a failed probe is fatal.
+AC_MSG_RESULT($have_mips32r2)
+if test $enable_mips32r2 = yes && test $have_mips32r2 = no ; then
+   AC_MSG_ERROR([MIPS32R2 not detected])
+fi
+
+
+dnl ==========================================================================
+dnl Check if the compiler supports MIPS DSP ASE Rev 1 instructions
+dnl Probe: try to compile an inline addu.qb instruction with the current flags.
+AC_MSG_CHECKING(whether to use MIPS DSP ASE Rev 1 instructions)
+AC_COMPILE_IFELSE([[
+void test()
+{
+        asm("addu.qb \$v0,\$a0,\$a1");
+}
+]], have_mips_dspase1=yes, have_mips_dspase1=no)
+
+AC_ARG_ENABLE(mips-dspase1,
+   [AC_HELP_STRING([--disable-mips-dspase1],
+                   [disable MIPS DSP ASE Rev 1 fast paths])],
+   [enable_mips_dspase1=$enableval], [enable_mips_dspase1=auto])
+dnl An explicit --disable-mips-dspase1 overrides a successful probe.
+if test $enable_mips_dspase1 = no ; then
+   have_mips_dspase1=disabled
+fi
+dnl Publish the result as a preprocessor define and an automake conditional.
+if test $have_mips_dspase1 = yes ; then
+   AC_DEFINE(USE_MIPS_DSPASE1, 1, [use MIPS DSP ASE Rev 1 optimizations])
+fi
+
+AM_CONDITIONAL(USE_MIPS_DSPASE1, test $have_mips_dspase1 = yes)
+dnl Report yes, no or disabled; --enable-mips-dspase1 with a failed probe is fatal.
+AC_MSG_RESULT($have_mips_dspase1)
+if test $enable_mips_dspase1 = yes && test $have_mips_dspase1 = no ; then
+   AC_MSG_ERROR([MIPS DSP ASE Rev 1 not detected])
+fi
+
 dnl ==============================================
 dnl Static test programs
 
diff --git a/pixman/Makefile.am b/pixman/Makefile.am
index ca31301..d832db1 100644
--- a/pixman/Makefile.am
+++ b/pixman/Makefile.am
@@ -123,5 +123,27 @@ libpixman_1_la_LIBADD += libpixman-arm-neon.la
 ASM_CFLAGS_arm_neon=
 endif
 
+# MIPS32R2: convenience library (C glue + assembly) added to libpixman-1 when USE_MIPS32R2 is set
+if USE_MIPS32R2
+noinst_LTLIBRARIES += libpixman-mips32r2.la
+libpixman_mips32r2_la_SOURCES = \
+	pixman-mips32r2.c \
+	pixman-mips32r2-asm.S
+libpixman_mips32r2_la_CFLAGS = $(DEP_CFLAGS)
+libpixman_mips32r2_la_LIBADD = $(DEP_LIBS)
+libpixman_1_la_LIBADD += libpixman-mips32r2.la
+endif
+
+# MIPS DSP ASE Rev 1: convenience library (C glue + assembly) added to libpixman-1 when USE_MIPS_DSPASE1 is set
+if USE_MIPS_DSPASE1
+noinst_LTLIBRARIES += libpixman-mips-dspase1.la
+libpixman_mips_dspase1_la_SOURCES = \
+	pixman-mips-dspase1.c \
+	pixman-mips-dspase1-asm.S
+libpixman_mips_dspase1_la_CFLAGS = $(DEP_CFLAGS)
+libpixman_mips_dspase1_la_LIBADD = $(DEP_LIBS)
+libpixman_1_la_LIBADD += libpixman-mips-dspase1.la
+endif
+
 .c.s : $(libpixmaninclude_HEADERS) $(BUILT_SOURCES)
 	$(CC) $(CFLAGS) $(ASM_CFLAGS_$(@:pixman-%.s=%)) $(ASM_CFLAGS_$(@:pixman-arm-%.s=arm_%)) -DHAVE_CONFIG_H -I$(srcdir) -I$(builddir) -I$(top_builddir) -S -o $@ $<
diff --git a/pixman/pixman-cpu.c b/pixman/pixman-cpu.c
index 0e14ecb..ee6dc1c 100644
--- a/pixman/pixman-cpu.c
+++ b/pixman/pixman-cpu.c
@@ -573,6 +573,17 @@ pixman_have_sse2 (void)
 #endif /* __amd64__ */
 #endif
 
+#ifdef USE_MIPS32R2
+/* No runtime probe: a build with USE_MIPS32R2 always reports support. */
+#define pixman_have_mips32r2() TRUE
+#endif /* USE_MIPS32R2 */
+
+#ifdef USE_MIPS_DSPASE1
+/* No runtime probe: a build with USE_MIPS_DSPASE1 always reports support. */
+#define pixman_have_mips_dspase1() TRUE
+#endif /* USE_MIPS_DSPASE1 */
+
+
 pixman_implementation_t *
 _pixman_choose_implementation (void)
 {
@@ -606,6 +617,16 @@ _pixman_choose_implementation (void)
 	imp = _pixman_implementation_create_vmx (imp);
 #endif
 
+#ifdef USE_MIPS32R2
+    if (pixman_have_mips32r2 ())
+	imp = _pixman_implementation_create_mips32r2 (imp);
+#endif
+
+#ifdef USE_MIPS_DSPASE1
+    if (pixman_have_mips_dspase1 ())
+	imp = _pixman_implementation_create_mips_dspase1 (imp);
+#endif
+
     return imp;
 }
 
diff --git a/pixman/pixman-mips-dspase1-asm.S b/pixman/pixman-mips-dspase1-asm.S
new file mode 100644
index 0000000..b96fe83
--- /dev/null
+++ b/pixman/pixman-mips-dspase1-asm.S
@@ -0,0 +1,189 @@
+
+	.text
+	.set		noreorder
+	.set		nomacro
+
+
+// void
+// mips_dspase1_combine_over_u_nomask(uint32_t *dest, const uint32_t *src,
+//	const uint32_t *mask, int width)
+//
+// OVER without a mask; use only when mask = NULL ($a2 is never read).
+// Per pixel and 8-bit channel: dest = src + dest * ALPHA_8(~src) / 255
+// (rounded), with a saturating final add. Returns at once if width <= 0.
+// In: $a0 = dest, $a1 = src, $a3 = width. Uses $v0, $t0-$t6, $t9.
+
+	.global		mips_dspase1_combine_over_u_nomask
+	.ent		mips_dspase1_combine_over_u_nomask
+
+mips_dspase1_combine_over_u_nomask:
+	blez		$a3, 1f		// nothing to do for width <= 0
+	subu		$v0, $a1, $a0	// diff = src - dest (for LWX); delay slot
+
+	sll		$a3, $a3, 2	// width <<= 2
+	addu		$a3, $a0, $a3	// dest_end = dest + width
+	lui		$t9, 0x0080	// $t9 = 0x00800080, rounding bias per halfword
+	ori		$t9, $t9, 0x0080	// (explicit pair: file uses .set nomacro)
+
+0:
+	// load inside the loop: nothing past dest[width-1] / src[width-1] is read
+	lwx		$t1, $v0($a0)	// src (dest + diff)
+	lw		$t0, 0($a0)	// dest
+
+	not		$t2, $t1	// ~src
+	srl		$t2, $t2, 24	// ALPHA_8(~src)
+	ins		$t2, $t2, 16, 8	// 0:a:0:a; equivalent to replv.ph
+
+	muleu_s.ph.qbl	$t3, $t0, $t2	// upper two dest bytes * a
+	muleu_s.ph.qbr	$t4, $t0, $t2	// lower two dest bytes * a
+	addiu		$a0, $a0, 4	// dest++
+
+	addu		$t3, $t3, $t9	// can't overflow; rev2: addu_s.ph
+	addu		$t4, $t4, $t9	// can't overflow; rev2: addu_s.ph
+	preceu.ph.qbla	$t5, $t3	// rev2: shrl.ph
+	preceu.ph.qbla	$t6, $t4	// rev2: shrl.ph
+	addu		$t3, $t3, $t5	// can't overflow; rev2: addu_s.ph
+	addu		$t4, $t4, $t6	// can't overflow; rev2: addu_s.ph
+
+	precrq.qb.ph	$t3, $t3, $t4	// high byte of each halfword -> 4 bytes
+	addu_s.qb	$t3, $t3, $t1	// + src, saturating per byte
+
+	bne		$a0, $a3, 0b
+	sw		$t3, -4($a0)	// dest (branch delay slot)
+
+1:
+	jr		$ra
+	nop
+
+	.end		mips_dspase1_combine_over_u_nomask
+
+
+// void
+// mips_dspase1_combine_over_u_mask(uint32_t *dest, const uint32_t *src,
+//	const uint32_t *mask, int width)
+//
+// OVER with a mask; use only when mask != NULL.
+// Per pixel and 8-bit channel: s = src * (mask >> 24) / 255, then
+// dest = s + dest * ALPHA_8(~s) / 255 (rounded, final add saturating).
+// In: $a0 = dest, $a1 = src, $a2 = mask, $a3 = width (<= 0: no-op).
+// Uses $v0, $v1, $t0-$t6, $t8, $t9.
+	.global		mips_dspase1_combine_over_u_mask
+	.ent		mips_dspase1_combine_over_u_mask
+
+mips_dspase1_combine_over_u_mask:
+	blez		$a3, 1f		// nothing to do for width <= 0
+	subu		$v0, $a1, $a0	// sdiff = src - dest (for LWX); delay slot
+
+	subu		$v1, $a2, $a0	// mdiff = mask - dest (for LWX)
+	sll		$a3, $a3, 2	// width <<= 2
+	addu		$a3, $a0, $a3	// dest_end = dest + width
+	lui		$t9, 0x0080	// $t9 = 0x00800080, rounding bias per halfword
+	ori		$t9, $t9, 0x0080	// (explicit pair: file uses .set nomacro)
+
+0:
+	lwx		$t8, $v1($a0)	// mask (dest + mdiff)
+	lwx		$t1, $v0($a0)	// src (dest + sdiff)
+
+	srl		$t8, $t8, 24	// mask >>= A_SHIFT
+	ins		$t8, $t8, 16, 8	// 0:m:0:m; equivalent to replv.ph
+
+	muleu_s.ph.qbl	$t3, $t1, $t8	// upper two src bytes * m
+	muleu_s.ph.qbr	$t4, $t1, $t8	// lower two src bytes * m
+
+	lw		$t0, 0($a0)	// dest
+	addu		$t3, $t3, $t9	// can't overflow; rev2: addu_s.ph
+	addu		$t4, $t4, $t9	// can't overflow; rev2: addu_s.ph
+	preceu.ph.qbla	$t5, $t3	// rev2: shrl.ph
+	preceu.ph.qbla	$t6, $t4	// rev2: shrl.ph
+	addu		$t3, $t3, $t5	// can't overflow; rev2: addu_s.ph
+	addu		$t4, $t4, $t6	// can't overflow; rev2: addu_s.ph
+	precrq.qb.ph	$t1, $t3, $t4	// $t1 = masked src
+
+	not		$t2, $t1	// ~src
+	srl		$t2, $t2, 24	// ALPHA_8(~src)
+	ins		$t2, $t2, 16, 8	// 0:a:0:a; equivalent to replv.ph
+
+	muleu_s.ph.qbl	$t3, $t0, $t2	// upper two dest bytes * a
+	muleu_s.ph.qbr	$t4, $t0, $t2	// lower two dest bytes * a
+	addiu		$a0, $a0, 4	// dest++
+
+	addu		$t3, $t3, $t9	// can't overflow; rev2: addu_s.ph
+	addu		$t4, $t4, $t9	// can't overflow; rev2: addu_s.ph
+	preceu.ph.qbla	$t5, $t3	// rev2: shrl.ph
+	preceu.ph.qbla	$t6, $t4	// rev2: shrl.ph
+	addu		$t3, $t3, $t5	// can't overflow; rev2: addu_s.ph
+	addu		$t4, $t4, $t6	// can't overflow; rev2: addu_s.ph
+	precrq.qb.ph	$t3, $t3, $t4	// high byte of each halfword -> 4 bytes
+	addu_s.qb	$t3, $t3, $t1	// + masked src, saturating per byte
+
+	bne		$a0, $a3, 0b
+	sw		$t3, -4($a0)	// dest (branch delay slot)
+
+1:
+	jr		$ra
+	nop
+
+	.end		mips_dspase1_combine_over_u_mask
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+// void
+// mips_dspase1_composite_over_n_8_8888_inner(uint32_t *dest, const uint32_t src,
+//	const uint8_t *mask, int width)
+// One scanline: s = src * m / 255, dest = s + dest * ALPHA_8(~s) / 255 (saturating add).
+// In: $a0 = dest, $a1 = src, $a2 = mask, $a3 = width (<= 0: no-op). Uses $t0-$t6, $t8, $t9.
+	.global		mips_dspase1_composite_over_n_8_8888_inner
+	.ent		mips_dspase1_composite_over_n_8_8888_inner
+
+mips_dspase1_composite_over_n_8_8888_inner:
+	blez		$a3, 1f		// nothing to do for width <= 0
+	sll		$a3, $a3, 2	// width <<= 2 (delay slot)
+	addu		$a3, $a0, $a3	// dest_end = dest + width
+	lui		$t9, 0x0080	// $t9 = 0x00800080, rounding bias per halfword
+	ori		$t9, $t9, 0x0080	// (explicit pair: file uses .set nomacro)
+
+0:
+	lbu		$t8, 0($a2)	// mask
+	lw		$t0, 0($a0)	// dest
+	ins		$t8, $t8, 16, 8	// 0:m:0:m; equivalent to replv.ph
+
+	muleu_s.ph.qbl	$t3, $a1, $t8	// upper two src bytes * m
+	muleu_s.ph.qbr	$t4, $a1, $t8	// lower two src bytes * m
+
+	addiu		$a0, $a0, 4	// dest++
+	addiu		$a2, $a2, 1	// mask++
+
+	addu		$t3, $t3, $t9	// can't overflow; rev2: addu_s.ph
+	addu		$t4, $t4, $t9	// can't overflow; rev2: addu_s.ph
+	preceu.ph.qbla	$t5, $t3	// rev2: shrl.ph
+	preceu.ph.qbla	$t6, $t4	// rev2: shrl.ph
+	addu		$t3, $t3, $t5	// can't overflow; rev2: addu_s.ph
+	addu		$t4, $t4, $t6	// can't overflow; rev2: addu_s.ph
+	precrq.qb.ph	$t1, $t3, $t4	// in(src,m)
+
+	not		$t2, $t1	// ~in(src,m)
+	srl		$t2, $t2, 24	// ALPHA_8(~in(src,m))
+	ins		$t2, $t2, 16, 8	// 0:a:0:a; equivalent to replv.ph
+
+	muleu_s.ph.qbl	$t3, $t0, $t2	// upper two dest bytes * a
+	muleu_s.ph.qbr	$t4, $t0, $t2	// lower two dest bytes * a
+
+	addu		$t3, $t3, $t9	// can't overflow; rev2: addu_s.ph
+	addu		$t4, $t4, $t9	// can't overflow; rev2: addu_s.ph
+	preceu.ph.qbla	$t5, $t3	// rev2: shrl.ph
+	preceu.ph.qbla	$t6, $t4	// rev2: shrl.ph
+	addu		$t3, $t3, $t5	// can't overflow; rev2: addu_s.ph
+	addu		$t4, $t4, $t6	// can't overflow; rev2: addu_s.ph
+	precrq.qb.ph	$t3, $t3, $t4	// high byte of each halfword -> 4 bytes
+	addu_s.qb	$t3, $t3, $t1	// over(in(src,m),dest)
+
+	bne		$a0, $a3, 0b
+	sw		$t3, -4($a0)	// dest (branch delay slot)
+
+1:
+	jr		$ra
+	nop
+
+	.end		mips_dspase1_composite_over_n_8_8888_inner
+
diff --git a/pixman/pixman-mips-dspase1.c b/pixman/pixman-mips-dspase1.c
new file mode 100644
index 0000000..59722d2
--- /dev/null
+++ b/pixman/pixman-mips-dspase1.c
@@ -0,0 +1,107 @@
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "pixman-private.h"
+
+
+/* Routines implemented in pixman-mips-dspase1-asm.S. */
+/* OVER combiner for one scanline; valid only when mask is NULL. */
+void
+mips_dspase1_combine_over_u_nomask(uint32_t *dest, const uint32_t *src,
+								   const uint32_t *mask, int width);
+/* OVER combiner for one scanline; valid only when mask is not NULL. */
+void
+mips_dspase1_combine_over_u_mask(uint32_t *dest, const uint32_t *src,
+								 const uint32_t *mask, int width);
+/* Solid src, 8-bit mask, 32-bit dest: composites width pixels of one scanline. */
+void
+mips_dspase1_composite_over_n_8_8888_inner(uint32_t *dest, uint32_t src,
+										   const uint8_t *mask, int width);
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+
+static void
+mips_dspase1_combine_over_u(pixman_implementation_t *imp,
+                            pixman_op_t              op,
+                            uint32_t *               dest,
+                            const uint32_t *         src,
+                            const uint32_t *         mask,
+                            int                      width)
+{
+    /*
+     * OVER combiner: forward to the assembly routine that matches the
+     * presence of a mask. imp and op are not used.
+     */
+    if (!mask)
+    {
+        mips_dspase1_combine_over_u_nomask (dest, src, mask, width);
+        return;
+    }
+    mips_dspase1_combine_over_u_mask (dest, src, mask, width);
+}
+
+
+/*
+ * OVER with a solid source and an a8 mask onto a 32-bit destination:
+ * runs the assembly inner loop once per scanline.
+ */
+static void
+mips_dspase1_fast_composite_over_n_8_8888(pixman_implementation_t *imp,
+                                          pixman_op_t              op,
+                                          pixman_image_t *         src_image,
+                                          pixman_image_t *         mask_image,
+                                          pixman_image_t *         dst_image,
+                                          int32_t                  src_x,
+                                          int32_t                  src_y,
+                                          int32_t                  mask_x,
+                                          int32_t                  mask_y,
+                                          int32_t                  dest_x,
+                                          int32_t                  dest_y,
+                                          int32_t                  width,
+                                          int32_t                  height)
+{
+    uint32_t  src;
+    uint32_t *dst_line;
+    uint8_t  *mask_line;
+    int       dst_stride, mask_stride;
+
+    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
+    /* An all-zero source adds nothing under OVER: skip the whole rectangle. */
+    if (src == 0)
+        return;
+
+    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    while (height--)
+    {
+        mips_dspase1_composite_over_n_8_8888_inner (dst_line, src, mask_line, width);
+        dst_line += dst_stride;
+        mask_line += mask_stride;
+    }
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+/* Fast paths of this implementation: one OVER entry (solid, a8, a8r8g8b8), then a PIXMAN_OP_NONE entry. */
+static const pixman_fast_path_t mips_dspase1_fast_paths[] =
+{
+	PIXMAN_STD_FAST_PATH(OVER, solid, a8, a8r8g8b8, mips_dspase1_fast_composite_over_n_8_8888),
+    { PIXMAN_OP_NONE }
+};
+
+
+/* Creates the DSP ASE implementation over delegate and installs its OVER combiner. */
+pixman_implementation_t *
+_pixman_implementation_create_mips_dspase1 (pixman_implementation_t *delegate)
+{
+    pixman_implementation_t *self;
+
+    self = _pixman_implementation_create (delegate, mips_dspase1_fast_paths);
+    self->combine_32[PIXMAN_OP_OVER] = mips_dspase1_combine_over_u;
+    return self;
+}
diff --git a/pixman/pixman-mips32r2-asm.S b/pixman/pixman-mips32r2-asm.S
new file mode 100644
index 0000000..e5b4a6c
--- /dev/null
+++ b/pixman/pixman-mips32r2-asm.S
@@ -0,0 +1,180 @@
+
+	.text
+	.set		noreorder
+	.set		nomacro
+
+
+// pixman_bool_t
+// mips32r2_pixman_fill32(uint32_t *bits, int stride, int x, int y,
+//	int width, int height, uint32_t  xor)
+// Stores xor into every 32-bit word of a width x height rectangle; returns 1.
+	.global		mips32r2_pixman_fill32
+	.ent		mips32r2_pixman_fill32
+// In: $a0 = bits, $a1 = stride (in 32-bit words), $a2 = x, $a3 = y; rest on the stack.
+mips32r2_pixman_fill32:
+	mul		$a3, $a1, $a3	// y * stride
+	addu		$a3, $a3, $a2	// + x
+	sll		$a3, $a3, 2	// word index -> byte offset
+	addu		$a0, $a0, $a3	// bits = bits + y * stride + x
+
+	lw		$a2, 16($sp)	// width
+	lw		$a3, 20($sp)	// height
+	lw		$v0, 24($sp)	// xor
+
+	li		$t0, ~7	// mask that rounds width down to a multiple of 8
+	beqz		$a3, 5f		// exit if height = 0
+	and		$t0, $a2, $t0	// width8 = width & ~7
+
+	sll		$a1, $a1, 2	// stride <<= 2
+	sll		$t0, $t0, 2	// width8 <<= 2
+	sll		$a2, $a2, 2	// width <<= 2
+// per-row loop: $a0 = start of the current row, $a3 = rows left
+0:
+	move		$t1, $a0	// b = bits
+	addu		$t2, $t1, $t0	// b + width8
+
+	beq		$t1, $t2, 2f	// skip unrolled loop if not enough samples
+	addu		$t3, $t1, $a2	// b + width
+// unrolled loop: 8 words per pass, the 8th store sits in the branch delay slot
+1:
+	sw		$v0, 0($t1)
+	sw		$v0, 4($t1)
+	sw		$v0, 8($t1)
+	sw		$v0,12($t1)
+	sw		$v0,16($t1)
+	sw		$v0,20($t1)
+	sw		$v0,24($t1)
+
+	addiu		$t1, $t1, 32	// b += 8
+	bne		$t1, $t2, 1b	// b = (bits + width8)?
+	sw		$v0, -4($t1)	// 8th word (delay slot)
+
+2:
+	beq		$t1, $t3, 4f	// skip single-sample loop if all work done
+	addiu		$a3, $a3, -1	// height--
+// tail loop: one word per pass, stored from the branch delay slot
+3:
+	addiu		$t1, $t1, 4	// b++
+	bne		$t1, $t3, 3b	// b = (bits + width)?
+	sw		$v0, -4($t1)	// word just stepped over (delay slot)
+
+4:
+	bnez		$a3, 0b		// next row while height != 0
+	addu		$a0, $a0, $a1	// bits += stride
+
+5:
+	jr		$ra
+	li		$v0, 1	// return value 1 (delay slot)
+
+	.end		mips32r2_pixman_fill32
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+// void
+// mips32r2_composite_over_n_8_8888_inner(uint32_t *dest, const uint32_t src,
+//	const uint8_t *mask, int width)
+// One scanline of (src IN mask) OVER dest. In: $a0 dest, $a1 src, $a2 mask, $a3 width (<= 0: no-op).
+	.global		mips32r2_composite_over_n_8_8888_inner
+	.ent		mips32r2_composite_over_n_8_8888_inner
+
+mips32r2_composite_over_n_8_8888_inner:
+	blez		$a3, 1f		// nothing to do for width <= 0
+	sll		$a3, $a3, 2	// width <<= 2 (delay slot)
+	addu		$a3, $a0, $a3	// dest_end = dest + width
+	lui		$t7, 0x0100	// $t7 = 0x01000100 (saturation helper);
+	ori		$t7, $t7, 0x0100	// explicit pairs: file uses .set nomacro
+	lui		$t8, 0x00FF	// $t8 = 0x00FF00FF
+	ori		$t8, $t8, 0x00FF	// RB_MASK
+	lui		$t9, 0x0080	// $t9 = 0x00800080
+	ori		$t9, $t9, 0x0080	// rounding bias per 16-bit lane
+0:
+	lbu		$t2, 0($a2)	// mask
+
+	// in(): in = src * m / 255 (rounded), two channels per multiply
+
+	and		$t5, $a1, $t8	// src bytes 2 and 0
+	mul		$t3, $t5, $t2
+
+	lw		$t0, 0($a0)	// dest
+	addiu		$a2, $a2, 1	// mask++
+
+	srl		$t6, $a1, 8
+	and		$t6, $t6, $t8	// src bytes 3 and 1
+	mul		$t4, $t6, $t2
+
+	addu		$t3, $t3, $t9	// t = x * m + 0x800080
+	srl		$t5, $t3, 8
+	and		$t5, $t5, $t8
+	addu		$t3, $t3, $t5	// t += (t >> 8) & RB_MASK
+	srl		$t3, $t3, 8
+	and		$t3, $t3, $t8	// (t >> 8) & RB_MASK
+
+	addu		$t4, $t4, $t9	// same steps for bytes 3 and 1
+	srl		$t6, $t4, 8
+	and		$t6, $t6, $t8
+	addu		$t4, $t4, $t6
+	srl		$t4, $t4, 8
+	and		$t4, $t4, $t8
+
+	sll		$t4, $t4, 8
+	or		$t1, $t3, $t4	// $t1 = in()
+
+
+	not		$t2, $t1	// ~in()
+	srl		$t2, $t2, 24	// a = ALPHA_8(~in())
+
+	// over(): UN8_rb_MUL_UN8() and UN8_rb_ADD_UN8_rb()
+
+	and		$t5, $t0, $t8	// dest bytes 2 and 0
+	mul		$t3, $t5, $t2
+
+	addiu		$a0, $a0, 4	// dest++
+
+	srl		$t6, $t0, 8
+	and		$t6, $t6, $t8	// dest bytes 3 and 1
+	mul		$t4, $t6, $t2
+
+	addu		$t3, $t3, $t9	// dest * a / 255 for bytes 2 and 0
+	srl		$t5, $t3, 8
+	and		$t5, $t5, $t8
+	addu		$t3, $t3, $t5
+	srl		$t3, $t3, 8
+	and		$t3, $t3, $t8
+
+	and		$t5, $t1, $t8	// in() bytes 2 and 0
+	addu		$t3, $t3, $t5	// t = x + y
+	srl		$t5, $t3, 8
+	and		$t5, $t5, $t8	// carry bit of each lane
+	subu		$t5, $t7, $t5	// 0x100 - carry per lane
+	or		$t3, $t3, $t5	// forces the low byte to 0xFF on carry
+	and		$t3, $t3, $t8
+
+	addu		$t4, $t4, $t9	// dest * a / 255 for bytes 3 and 1
+	srl		$t6, $t4, 8
+	and		$t6, $t6, $t8
+	addu		$t4, $t4, $t6
+	srl		$t4, $t4, 8
+	and		$t4, $t4, $t8
+
+	srl		$t6, $t1, 8
+	and		$t6, $t6, $t8	// in() bytes 3 and 1
+	addu		$t4, $t4, $t6	// t = x + y
+	srl		$t6, $t4, 8
+	and		$t6, $t6, $t8	// carry bit of each lane
+	subu		$t6, $t7, $t6	// 0x100 - carry per lane
+	or		$t4, $t4, $t6	// forces the low byte to 0xFF on carry
+	and		$t4, $t4, $t8
+
+	sll		$t4, $t4, 8
+	or		$t3, $t3, $t4	// recombine the four channels
+
+	bne		$a0, $a3, 0b
+	sw		$t3, -4($a0)	// dest (branch delay slot)
+
+1:
+	jr		$ra
+	nop
+
+	.end		mips32r2_composite_over_n_8_8888_inner
+
diff --git a/pixman/pixman-mips32r2.c b/pixman/pixman-mips32r2.c
new file mode 100644
index 0000000..ec56a18
--- /dev/null
+++ b/pixman/pixman-mips32r2.c
@@ -0,0 +1,112 @@
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "pixman-private.h"
+
+
+/* Routines implemented in pixman-mips32r2-asm.S. */
+/* Fills a width x height rectangle of 32-bit words with xor. */
+pixman_bool_t
+mips32r2_pixman_fill32(uint32_t *bits, int stride, int x, int y,
+					   int width, int height, uint32_t  xor);
+/* Solid src, 8-bit mask, 32-bit dest: composites width pixels of one scanline. */
+void
+mips32r2_composite_over_n_8_8888_inner(uint32_t *dest, uint32_t src,
+									   const uint8_t *mask, int width);
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+
+/* fill() hook: 32 bpp goes to the assembly routine, other depths to the delegate. */
+static pixman_bool_t
+mips32r2_fill(pixman_implementation_t *imp,
+              uint32_t *               bits,
+              int                      stride,
+              int                      bpp,
+              int                      x,
+              int                      y,
+              int                      width,
+              int                      height,
+              uint32_t                 xor)
+{
+    pixman_bool_t done;
+
+    if (bpp == 32)
+    {
+        /* Native 32 bpp path. */
+        done = mips32r2_pixman_fill32 (bits, stride, x, y, width, height, xor);
+    }
+    else
+    {
+        /* Any other depth: defer to the next implementation. */
+        done = _pixman_implementation_fill (imp->delegate, bits, stride, bpp,
+                                            x, y, width, height, xor);
+    }
+
+    return done;
+}
+
+
+/*
+ * OVER with a solid source and an a8 mask onto a 32-bit destination:
+ * runs the assembly inner loop once per scanline.
+ */
+static void
+mips32r2_fast_composite_over_n_8_8888(pixman_implementation_t *imp,
+                                      pixman_op_t              op,
+                                      pixman_image_t *         src_image,
+                                      pixman_image_t *         mask_image,
+                                      pixman_image_t *         dst_image,
+                                      int32_t                  src_x,
+                                      int32_t                  src_y,
+                                      int32_t                  mask_x,
+                                      int32_t                  mask_y,
+                                      int32_t                  dest_x,
+                                      int32_t                  dest_y,
+                                      int32_t                  width,
+                                      int32_t                  height)
+{
+    uint32_t  src;
+    uint32_t *dst_line;
+    uint8_t  *mask_line;
+    int       dst_stride, mask_stride;
+
+    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
+    /* An all-zero source adds nothing under OVER: skip the whole rectangle. */
+    if (src == 0)
+        return;
+
+    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    while (height--)
+    {
+        mips32r2_composite_over_n_8_8888_inner (dst_line, src, mask_line, width);
+        dst_line += dst_stride;
+        mask_line += mask_stride;
+    }
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+/* Fast paths of this implementation: one OVER entry (solid, a8, a8r8g8b8), then a PIXMAN_OP_NONE entry. */
+static const pixman_fast_path_t mips32r2_fast_paths[] =
+{
+	PIXMAN_STD_FAST_PATH(OVER, solid, a8, a8r8g8b8, mips32r2_fast_composite_over_n_8_8888),
+    { PIXMAN_OP_NONE }
+};
+
+
+/* Creates the MIPS32R2 implementation over delegate and installs its fill hook. */
+pixman_implementation_t *
+_pixman_implementation_create_mips32r2 (pixman_implementation_t *delegate)
+{
+    pixman_implementation_t *self;
+
+    self = _pixman_implementation_create (delegate, mips32r2_fast_paths);
+    self->fill = mips32r2_fill;
+    return self;
+}
diff --git a/pixman/pixman-private.h b/pixman/pixman-private.h
index 1473dc4..813598f 100644
--- a/pixman/pixman-private.h
+++ b/pixman/pixman-private.h
@@ -567,6 +567,17 @@ pixman_implementation_t *
 _pixman_implementation_create_vmx (pixman_implementation_t *fallback);
 #endif
 
+/* MIPS32R2 implementation; declared only when USE_MIPS32R2 is defined. */
+#ifdef USE_MIPS32R2
+pixman_implementation_t *
+_pixman_implementation_create_mips32r2 (pixman_implementation_t *delegate);
+#endif
+/* MIPS DSP ASE Rev 1 implementation; declared only when USE_MIPS_DSPASE1 is defined. */
+#ifdef USE_MIPS_DSPASE1
+pixman_implementation_t *
+_pixman_implementation_create_mips_dspase1 (pixman_implementation_t *delegate);
+#endif
+
 pixman_implementation_t *
 _pixman_choose_implementation (void);
 
-- 
1.7.0.4