[Pixman] [PATCH 2/3] DSPASE Cleanup and add operations
Veli-Matti Valtonen
veli-matti.valtonen at movial.com
Tue Feb 22 01:05:56 PST 2011
MIPS: DSPASE Modified the original dspase commit to use the arm-neon bind macros
MIPS: DSPASE Implemented add_8888_8888 and add_n_8888
MIPS: DSPASE Added some simple MIPS function begin/end macros.
MIPS: DSPASE Implemented scanline add.
---
pixman/pixman-mips-dspase1-asm.S | 331 ++++++++++++++++++++++++++++++++------
pixman/pixman-mips-dspase1.c | 75 +++++----
2 files changed, 325 insertions(+), 81 deletions(-)
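
Notes for reviewers: OP_ADD is a per-channel saturating add; the DSP ASE
addu_s.qb instruction performs it on all four bytes of a packed a8r8g8b8
pixel at once, which is what the new loops below rely on. As a rough
scalar C equivalent of that per-pixel step (the helper name is
illustrative, not part of the patch):

    static uint32_t
    add_8888_saturate (uint32_t dst, uint32_t src)
    {
        uint32_t result = 0;
        int shift;

        for (shift = 0; shift < 32; shift += 8)
        {
            /* add one byte lane, clamping at 255 like addu_s.qb */
            uint32_t sum = ((dst >> shift) & 0xff) + ((src >> shift) & 0xff);

            if (sum > 0xff)
                sum = 0xff;

            result |= sum << shift;
        }

        return result;
    }
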
diff --git a/pixman/pixman-mips-dspase1-asm.S b/pixman/pixman-mips-dspase1-asm.S
index b96fe83..596b38a 100644
--- a/pixman/pixman-mips-dspase1-asm.S
+++ b/pixman/pixman-mips-dspase1-asm.S
@@ -1,27 +1,37 @@
-
.text
+ .set mips32r2
+ .set nomips16
+ .set dsp
+
+.macro pixman_asm_func fname
+ .global \fname
+ .ent \fname
+#ifdef __ELF__
+ .type \fname, @function
+ .hidden \fname
+#endif
+\fname:
+.endm
+
+.macro pixman_end_func fname
+ .end \fname
+ .size \fname, .-\fname
+.endm
+
.set noreorder
.set nomacro
-
-// void
-// mips_dspase1_combine_over_u_nomask(uint32_t *dest, const uint32_t *src,
-// const uint32_t *mask, int width)
-
- .global mips_dspase1_combine_over_u_nomask
- .ent mips_dspase1_combine_over_u_nomask
-
// note: this version to be used only when mask = NULL
-mips_dspase1_combine_over_u_nomask:
- beqz $a3, 1f
- subu $v0, $a1, $a0 // diff = src - dest (for LWX)
+pixman_asm_func pixman_composite_scanline_over_asm_dspase1
+ beqz $a0, 1f
+ subu $v0, $a2, $a1 // diff = src - dest (for LWX)
- sll $a3, $a3, 2 // width <<= 2
- addu $a3, $a0, $a3 // dest_end = dest + width
+ sll $a0, $a0, 2 // width <<= 2
+ addu $a0, $a1, $a0 // dest_end = dest + width
- lw $t0, 0($a0) // dest
- lwx $t1, $v0($a0) // src (dest + diff)
+ lw $t0, 0($a1) // dest
+ lwx $t1, $v0($a1) // src (dest + diff)
li $t9, 0x00800080
@@ -33,8 +43,8 @@ mips_dspase1_combine_over_u_nomask:
muleu_s.ph.qbl $t3, $t0, $t2
muleu_s.ph.qbr $t4, $t0, $t2
- lw $t0, 4($a0) // dest[1] for next loop iteration
- addiu $a0, $a0, 4 // dest++
+ lw $t0, 4($a1) // dest[1] for next loop iteration
+ addiu $a1, $a1, 4 // dest++
addu $t3, $t3, $t9 // can't overflow; rev2: addu_s.ph
addu $t4, $t4, $t9 // can't overflow; rev2: addu_s.ph
@@ -46,41 +56,34 @@ mips_dspase1_combine_over_u_nomask:
precrq.qb.ph $t3, $t3, $t4
addu_s.qb $t3, $t3, $t1
- lwx $t1, $v0($a0) // src (dest + diff) for next loop iteration
+ lwx $t1, $v0($a1) // src (dest + diff) for next loop iteration
- bne $a0, $a3, 0b
- sw $t3, -4($a0) // dest
+ bne $a1, $a0, 0b
+ sw $t3, -4($a1) // dest
1:
jr $ra
nop
- .end mips_dspase1_combine_over_u_nomask
-
+pixman_end_func pixman_composite_scanline_over_asm_dspase1
-// void
-// mips_dspase1_combine_over_u_mask(uint32_t *dest, const uint32_t *src,
-// const uint32_t *mask, int width)
-
- .global mips_dspase1_combine_over_u_mask
- .ent mips_dspase1_combine_over_u_mask
// note: this version to be used only when mask != NULL
-mips_dspase1_combine_over_u_mask:
- beqz $a3, 1f
- subu $v0, $a1, $a0 // sdiff = src - dest (for LWX)
+pixman_asm_func pixman_composite_scanline_over_mask_asm_dspase1
+ beqz $a0, 1f
+ subu $v0, $a2, $a1 // sdiff = src - dest (for LWX)
- subu $v1, $a2, $a0 // mdiff = mask - dest (for LWX)
+ subu $v1, $a3, $a1 // mdiff = mask - dest (for LWX)
- sll $a3, $a3, 2 // width <<= 2
- addu $a3, $a0, $a3 // dest_end = dest + width
+ sll $a0, $a0, 2 // width <<= 2
+ addu $a0, $a1, $a0 // dest_end = dest + width
li $t9, 0x00800080
0:
- lwx $t8, $v1($a0) // mask (dest + mdiff)
- lwx $t1, $v0($a0) // src (dest + sdiff)
+ lwx $t8, $v1($a1) // mask (dest + mdiff)
+ lwx $t1, $v0($a1) // src (dest + sdiff)
srl $t8, $t8, 24 // mask >>= A_SHIFT
ins $t8, $t8, 16, 8 // 0:m:0:m; equivalent to replv.ph
@@ -88,7 +91,7 @@ mips_dspase1_combine_over_u_mask:
muleu_s.ph.qbl $t3, $t1, $t8
muleu_s.ph.qbr $t4, $t1, $t8
- lw $t0, 0($a0) // dest
+ lw $t0, 0($a1) // dest
addu $t3, $t3, $t9 // can't overflow; rev2: addu_s.ph
addu $t4, $t4, $t9 // can't overflow; rev2: addu_s.ph
@@ -105,7 +108,7 @@ mips_dspase1_combine_over_u_mask:
muleu_s.ph.qbl $t3, $t0, $t2
muleu_s.ph.qbr $t4, $t0, $t2
- addiu $a0, $a0, 4 // dest++
+ addiu $a1, $a1, 4 // dest++
addu $t3, $t3, $t9 // can't overflow; rev2: addu_s.ph
addu $t4, $t4, $t9 // can't overflow; rev2: addu_s.ph
@@ -116,26 +119,120 @@ mips_dspase1_combine_over_u_mask:
precrq.qb.ph $t3, $t3, $t4
addu_s.qb $t3, $t3, $t1
- bne $a0, $a3, 0b
- sw $t3, -4($a0) // dest
+ bne $a1, $a0, 0b
+ sw $t3, -4($a1) // dest
1:
jr $ra
nop
- .end mips_dspase1_combine_over_u_mask
+pixman_end_func pixman_composite_scanline_over_mask_asm_dspase1
+
+# Scanline add, no mask
+pixman_asm_func pixman_composite_scanline_add_asm_dspase1
+ beqz $a0, $scanline_add_exit
+ sll $a0, $a0, 2 # width in bytes (for addressing)
+
+ move $t9, $a0
+ ins $t9, $zero, 0, 4 # round byte count down to whole 16-byte (4-pixel) blocks
+
+ addu $t8, $a0, $a1
+
+ beqz $t9, $scanline_add_no_main_loop
+ addu $t4, $t9, $a1 # dst end ptr for the unrolled loop
+
+$scanline_add_width_loop:
+ lw $t2, 0($a1) # dst
+ lw $t3, 0($a2) # src
+ lw $v0, 4($a1)
+ lw $t7, 4($a2)
+ lw $v1, 8($a1)
+ lw $t0, 8($a2)
+ lw $t6, 12($a1)
+ lw $t1, 12($a2)
+
+ addiu $a1, $a1, 16
+ addiu $a2, $a2, 16
+
+ addu_s.qb $t2, $t2, $t3
+ sw $t2, -16($a1)
+ addu_s.qb $v0, $v0, $t7
+ sw $v0, -12($a1)
+ addu_s.qb $v1, $v1, $t0
+ sw $v1, -8($a1)
+ addu_s.qb $t6, $t6, $t1
+
+ bne $a1, $t4, $scanline_add_width_loop
+ sw $t6, -4($a1)
+$scanline_add_no_main_loop:
+ beq $t8, $a1, $scanline_add_exit
+ nop
+$scanline_add_leftover_loop:
+ lw $t2, 0($a1)
+ lw $t3, 0($a2)
+ addiu $a1, $a1, 4
+ addiu $a2, $a2, 4
+
+ addu_s.qb $t2, $t2, $t3
-////////////////////////////////////////////////////////////////////////////////
+ bne $a1, $t8, $scanline_add_leftover_loop
+ sw $t2, -4($a1)
+$scanline_add_exit:
+ jr $ra
+ nop
+pixman_end_func pixman_composite_scanline_add_asm_dspase1
+
+# Scanline add, mask
+pixman_asm_func pixman_composite_scanline_add_mask_asm_dspase1
+ beqz $a0, $scanline_add_mask_exit
+ sll $a0, $a0, 2 # width in bytes (for addressing)
+
+ li $t8, 0x00800080
+
+ subu $a2, $a2, $a1 // sdiff = src - dest (for LWX)
+ subu $a3, $a3, $a1 // mdiff = mask - dest (for LWX)
+
+ addu $t9, $a1, $a0
+$scanline_add_mask_loop:
+ lwx $t2, $a3($a1)
+ lwx $t1, $a2($a1)
+ lw $t0, 0($a1)
-// void
-// mips_dspase1_composite_over_n_8_8888_inner(uint32_t *dest, const uint32_t src,
-// const uint8_t *mask, int width)
+ addiu $a1, $a1, 4
+
+ # based on pixman_composite_scanline_over_mask_asm_dspase1
+ # converting these to macros might make sense
+ srl $t2, $t2, 24
+ ins $t2, $t2, 16, 8 // 0:m:0:m; equivalent to replv.ph
+
+ muleu_s.ph.qbl $t3, $t1, $t2
+ muleu_s.ph.qbr $t4, $t1, $t2
+
+ addu $t3, $t3, $t8 // can't overflow; rev2: addu_s.ph
+ addu $t4, $t4, $t8 // can't overflow; rev2: addu_s.ph
+
+ preceu.ph.qbla $t5, $t3 // rev2: shrl.ph
+ preceu.ph.qbla $t6, $t4 // rev2: shrl.ph
+ addu $t3, $t3, $t5 // can't overflow; rev2: addu_s.ph
+ addu $t4, $t4, $t6 // can't overflow; rev2: addu_s.ph
+
+ precrq.qb.ph $t1, $t3, $t4
+
+ addu_s.qb $t0, $t0, $t1
+
+ bne $a1, $t9, $scanline_add_mask_loop
+ sw $t0, -4($a1)
+$scanline_add_mask_exit:
+ jr $ra
+ nop
- .global mips_dspase1_composite_over_n_8_8888_inner
- .ent mips_dspase1_composite_over_n_8_8888_inner
+pixman_end_func pixman_composite_scanline_add_mask_asm_dspase1
-mips_dspase1_composite_over_n_8_8888_inner:
+
+////////////////////////////////////////////////////////////////////////////////
+
+pixman_asm_func mips_dspase1_composite_over_n_8_8888_inner
beqz $a3, 1f
sll $a3, $a3, 2 // width <<= 2
@@ -185,5 +282,139 @@ mips_dspase1_composite_over_n_8_8888_inner:
jr $ra
nop
- .end mips_dspase1_composite_over_n_8_8888_inner
+pixman_end_func mips_dspase1_composite_over_n_8_8888_inner
+
+pixman_asm_func pixman_composite_add_8888_8888_asm_dspase1
+ lw $v0, 16($sp) # src
+ lw $v1, 20($sp) # src_stride
+
+ beqz $a1, $add_8888_8888_exit
+ addiu $sp, $sp, -8
+
+ sw $s0, 0($sp)
+ sw $s1, 4($sp)
+
+ sll $a3, $a3, 2 # dst_stride in bytes
+ sll $v1, $v1, 2 # src_stride in bytes
+ sll $a0, $a0, 2 # width in bytes (for addressing)
+
+ subu $v1, $v1, $a0 # src_stride - width, in bytes
+ subu $a3, $a3, $a0 # dst_stride - width, in bytes
+
+ move $t9, $a0
+ ins $t9, $zero, 0, 4 # round byte count down to whole 16-byte (4-pixel) blocks
+
+$add_8888_8888_height_loop:
+ addu $t8, $a0, $a2
+
+ beqz $t9, $add_8888_8888_no_main_loop
+ addu $t4, $t9, $a2 # dst end ptr for the unrolled loop
+
+$add_8888_8888_width_loop:
+ lw $t2, 0($a2) # dst
+ lw $t3, 0($v0) # src
+ lw $s0, 4($a2)
+ lw $t7, 4($v0)
+ lw $s1, 8($a2)
+ lw $t0, 8($v0)
+ lw $t6, 12($a2)
+ lw $t1, 12($v0)
+
+ addiu $a2, $a2, 16
+ addiu $v0, $v0, 16
+
+ addu_s.qb $t2, $t2, $t3
+ sw $t2, -16($a2)
+ addu_s.qb $s0, $s0, $t7
+ sw $s0, -12($a2)
+ addu_s.qb $s1, $s1, $t0
+ sw $s1, -8($a2)
+ addu_s.qb $t6, $t6, $t1
+
+ bne $a2, $t4, $add_8888_8888_width_loop
+ sw $t6, -4($a2)
+$add_8888_8888_no_main_loop:
+ beq $t8, $a2, $add_8888_8888_no_leftover
+ addiu $a1, $a1, -1 # Decrement height
+$add_8888_8888_leftover_loop:
+ lw $t2, 0($a2)
+ lw $t3, 0($v0)
+
+ addiu $a2, $a2, 4
+ addiu $v0, $v0, 4
+
+ addu_s.qb $t2, $t2, $t3
+
+ bne $a2, $t8, $add_8888_8888_leftover_loop
+ sw $t2, -4($a2)
+$add_8888_8888_no_leftover:
+ addu $v0, $v0, $v1 # src += src_stride
+
+ bnez $a1, $add_8888_8888_height_loop
+ addu $a2, $a2, $a3 # dst += dst_stride
+
+ lw $s0, 0($sp)
+ lw $s1, 4($sp)
+
+$add_8888_8888_exit:
+ jr $ra
+ addiu $sp, $sp, 8
+pixman_end_func pixman_composite_add_8888_8888_asm_dspase1
+
+pixman_asm_func pixman_composite_add_n_8888_asm_dspase1
+ lw $v0, 16($sp) # Src
+
+ beqz $a1, $add_n_8888_exit
+ sll $a3, $a3, 2 # dst_stride in bytes
+ sll $a0, $a0, 2 # width in bytes (for addressing)
+
+ subu $a3, $a3, $a0
+
+ move $t9, $a0
+ ins $t9, $zero, 0, 4 # round byte count down to whole 16-byte (4-pixel) blocks
+
+$add_n_8888_height_loop:
+ addu $t8, $a0, $a2
+
+ beqz $t9, $add_n_8888_no_main_loop
+ addu $t4, $t9, $a2 # dst end ptr for the unrolled loop
+
+$add_n_8888_width_loop:
+ lw $t2, 0($a2) # dst
+ lw $t0, 4($a2)
+ lw $t1, 8($a2)
+ lw $t7, 12($a2)
+
+ addiu $a2, $a2, 16
+
+ addu_s.qb $t2, $t2, $v0
+ sw $t2, -16($a2)
+ addu_s.qb $t0, $t0, $v0
+ sw $t0, -12($a2)
+ addu_s.qb $t1, $t1, $v0
+ sw $t1, -8($a2)
+ addu_s.qb $t7, $t7, $v0
+
+ bne $a2, $t4, $add_n_8888_width_loop
+ sw $t7, -4($a2)
+$add_n_8888_no_main_loop:
+ beq $t8, $a2, $add_n_8888_no_leftover
+ addiu $a1, $a1, -1 # Decrement height
+$add_n_8888_leftover_loop:
+ lw $t2, 0($a2)
+
+ addiu $a2, $a2, 4 # moving this anywhere else would stall the following store
+ addu_s.qb $t2, $t2, $v0
+
+ bne $a2, $t8, $add_n_8888_leftover_loop
+ sw $t2, -4($a2)
+$add_n_8888_no_leftover:
+
+ bnez $a1, $add_n_8888_height_loop
+ addu $a2, $a2, $a3 # dst += dst_stride
+
+$add_n_8888_exit:
+ jr $ra
+ nop
+pixman_end_func pixman_composite_add_n_8888_asm_dspase1
diff --git a/pixman/pixman-mips-dspase1.c b/pixman/pixman-mips-dspase1.c
index 59722d2..0ab3f87 100644
--- a/pixman/pixman-mips-dspase1.c
+++ b/pixman/pixman-mips-dspase1.c
@@ -3,47 +3,22 @@
#endif
#include "pixman-private.h"
+#include "pixman-arm-common.h"
// assembly-language functions
void
-mips_dspase1_combine_over_u_nomask(uint32_t *dest, const uint32_t *src,
- const uint32_t *mask, int width);
-
-void
-mips_dspase1_combine_over_u_mask(uint32_t *dest, const uint32_t *src,
- const uint32_t *mask, int width);
-
-void
mips_dspase1_composite_over_n_8_8888_inner(uint32_t *dest, uint32_t src,
const uint8_t *mask, int width);
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST(dspase1, add_8888_8888,
+ uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_DST(SKIP_ZERO_SRC, dspase1, add_n_8888,
+ uint32_t, 1)
////////////////////////////////////////////////////////////////////////////////
-
-static void
-mips_dspase1_combine_over_u(pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dest,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- if (mask)
- {
-// _pixman_implementation_combine_32(imp->delegate, op, dest, src, mask, width);
- mips_dspase1_combine_over_u_mask(dest, src, mask, width);
- }
- else
- {
-// _pixman_implementation_combine_32(imp->delegate, op, dest, src, mask, width);
- mips_dspase1_combine_over_u_nomask(dest, src, mask, width);
- }
-}
-
-
static void
mips_dspase1_fast_composite_over_n_8_8888(pixman_implementation_t *imp,
pixman_op_t op,
@@ -84,6 +59,35 @@ mips_dspase1_fast_composite_over_n_8_8888(pixman_implementation_t *imp,
}
}
+#define BIND_COMBINE_U(name) \
+void \
+pixman_composite_scanline_##name##_mask_asm_dspase1 (int32_t w, \
+ const uint32_t *dst, \
+ const uint32_t *src, \
+ const uint32_t *mask); \
+ \
+void \
+pixman_composite_scanline_##name##_asm_dspase1 (int32_t w, \
+ const uint32_t *dst, \
+ const uint32_t *src); \
+ \
+static void \
+dspase1_combine_##name##_u (pixman_implementation_t *imp, \
+ pixman_op_t op, \
+ uint32_t * dest, \
+ const uint32_t * src, \
+ const uint32_t * mask, \
+ int width) \
+{ \
+ if (mask) \
+ pixman_composite_scanline_##name##_mask_asm_dspase1 (width, dest, \
+ src, mask); \
+ else \
+ pixman_composite_scanline_##name##_asm_dspase1 (width, dest, src); \
+}
+
+BIND_COMBINE_U (over)
+BIND_COMBINE_U (add)
////////////////////////////////////////////////////////////////////////////////
@@ -91,6 +95,14 @@ mips_dspase1_fast_composite_over_n_8_8888(pixman_implementation_t *imp,
static const pixman_fast_path_t mips_dspase1_fast_paths[] =
{
PIXMAN_STD_FAST_PATH(OVER, solid, a8, a8r8g8b8, mips_dspase1_fast_composite_over_n_8_8888),
+
+ PIXMAN_STD_FAST_PATH (ADD, solid, null, a8r8g8b8, dspase1_composite_add_n_8888 ),
+ PIXMAN_STD_FAST_PATH (ADD, solid, null, x8r8g8b8, dspase1_composite_add_n_8888 ),
+
+ PIXMAN_STD_FAST_PATH (ADD, x8r8g8b8, null, x8r8g8b8, dspase1_composite_add_8888_8888 ),
+ PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, dspase1_composite_add_8888_8888 ),
+ PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, dspase1_composite_add_8888_8888 ),
+
{ PIXMAN_OP_NONE }
};
@@ -101,7 +113,8 @@ _pixman_implementation_create_mips_dspase1 (pixman_implementation_t *delegate)
pixman_implementation_t *imp =
_pixman_implementation_create (delegate, mips_dspase1_fast_paths);
- imp->combine_32[PIXMAN_OP_OVER] = mips_dspase1_combine_over_u;
+ imp->combine_32[PIXMAN_OP_OVER] = dspase1_combine_over_u;
+ imp->combine_32[PIXMAN_OP_ADD] = dspase1_combine_add_u;
return imp;
}
--
1.7.0.4
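
For reference, the new full fast paths follow the o32 calling convention:
width, height, dst and dst_stride arrive in $a0..$a3, and the fifth and
sixth arguments (src and src_stride, loaded at the top of
pixman_composite_add_8888_8888_asm_dspase1) live at 16($sp) and 20($sp).
A minimal scalar sketch of what that function computes, reusing the
hypothetical add_8888_saturate helper from the note above (strides are in
32-bit units, as passed by the PIXMAN_ARM_BIND_FAST_PATH_SRC_DST wrapper):

    void
    add_8888_8888_ref (int32_t w, int32_t h,
                       uint32_t *dst, int32_t dst_stride,
                       uint32_t *src, int32_t src_stride)
    {
        while (h--)
        {
            int32_t i;

            /* one scanline: per-pixel saturating add */
            for (i = 0; i < w; i++)
                dst[i] = add_8888_saturate (dst[i], src[i]);

            dst += dst_stride;
            src += src_stride;
        }
    }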
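
The masked scanline loops reuse pixman's exact x*a/255 rounding trick: the
0x00800080 constant adds the 0x80 bias to two 16-bit products at once,
preceu.ph.qbla extracts the high byte of each product so it can be added
back in, and precrq.qb.ph performs the final shift right by 8. Per channel
this matches the following scalar sketch (illustrative only, the same
arithmetic as pixman's MUL_UN8):

    static uint8_t
    mul_un8 (uint8_t x, uint8_t a)
    {
        /* bias, cf. the 0x00800080 constant in the asm */
        uint32_t t = (uint32_t) x * a + 0x80;

        /* exact divide by 255: t/255 == (t + (t >> 8)) >> 8 for t < 0xff01 */
        return (t + (t >> 8)) >> 8;
    }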