[Pixman] [PATCH 3/3] DSPASE: More cleanup, out reverse op.
Veli-Matti Valtonen
veli-matti.valtonen at movial.com
Tue Feb 22 01:05:57 PST 2011
MIPS: DSPASE: Implemented the DSPASE1_UN8x4_MUL_UN8 macro pair.
MIPS: DSPASE: Implemented the scanline out_reverse operation (plain and masked).
MIPS: DSPASE: Converted over_n_8_8888 to use the common macro bindings.
---
pixman/pixman-mips-dspase1-asm.S | 226 +++++++++++++++++++++++++-------------
pixman/pixman-mips-dspase1.c | 50 +--------
2 files changed, 155 insertions(+), 121 deletions(-)
diff --git a/pixman/pixman-mips-dspase1-asm.S b/pixman/pixman-mips-dspase1-asm.S
index 596b38a..0cb2293 100644
--- a/pixman/pixman-mips-dspase1-asm.S
+++ b/pixman/pixman-mips-dspase1-asm.S
@@ -18,6 +18,26 @@
.size \fname, .-\fname
.endm
+# The result register may alias any of the input registers.
+# rb_half must contain 0x00800080 (the per-halfword rounding constant).
+.macro DSPASE1_UN8x4_MUL_UN8_head a, b, x, y
+ muleu_s.ph.qbl \x, \a, \b
+ muleu_s.ph.qbr \y, \a, \b
+.endm
+
+.macro DSPASE1_UN8x4_MUL_UN8_tail x, y, result, rb_half, tmp3, tmp4
+ addu \x, \x, \rb_half
+ addu \y, \y, \rb_half
+
+ preceu.ph.qbla \tmp3, \x
+ preceu.ph.qbla \tmp4, \y
+
+ addu \x, \x, \tmp3
+ addu \y, \y, \tmp4
+
+ precrq.qb.ph \result, \x, \y
+.endm
+
.set noreorder
.set nomacro
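For reference, the _head/_tail pair together implements pixman's UN8x4_MUL_UN8:
each UN8 channel of a packed 32-bit pixel is multiplied by an 8-bit value with
exact rounding (x * a / 255, rounded to nearest). A minimal C sketch of the same
arithmetic; the helper name is illustrative, not code from the patch:

    static inline uint32_t
    un8x4_mul_un8 (uint32_t x, uint32_t a)
    {
        uint32_t r = 0;
        int s;

        for (s = 0; s < 32; s += 8)
        {
            /* for t = c * a + 0x80, (t + (t >> 8)) >> 8 divides by 255
             * with exact rounding */
            uint32_t t = ((x >> s) & 0xff) * a + 0x80;
            r |= (((t + (t >> 8)) >> 8) & 0xff) << s;
        }
        return r;
    }

The addu/preceu.ph.qbla/precrq.qb.ph sequence in the _tail macro performs the
same add-0x80, add-high-byte, take-high-byte steps on two halfword lanes at a
time.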
@@ -40,20 +60,13 @@ pixman_asm_func pixman_composite_scanline_over_asm_dspase1
srl $t2, $t2, 24 // ALPHA_8(~src)
ins $t2, $t2, 16, 8 // 0:a:0:a; equivalent to replv.ph
- muleu_s.ph.qbl $t3, $t0, $t2
- muleu_s.ph.qbr $t4, $t0, $t2
+ DSPASE1_UN8x4_MUL_UN8_head $t0, $t2, $t3, $t4
lw $t0, 4($a1) // dest[1] for next loop iteration
addiu $a1, $a1, 4 // dest++
- addu $t3, $t3, $t9 // can't overflow; rev2: addu_s.ph
- addu $t4, $t4, $t9 // can't overflow; rev2: addu_s.ph
- preceu.ph.qbla $t5, $t3 // rev2: shrl.ph
- preceu.ph.qbla $t6, $t4 // rev2: shrl.ph
- addu $t3, $t3, $t5 // can't overflow; rev2: addu_s.ph
- addu $t4, $t4, $t6 // can't overflow; rev2: addu_s.ph
+ DSPASE1_UN8x4_MUL_UN8_tail $t3, $t4, $t3, $t9, $t5, $t6
- precrq.qb.ph $t3, $t3, $t4
addu_s.qb $t3, $t3, $t1
lwx $t1, $v0($a1) // src (dest + diff) for next loop iteration
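Per pixel this loop is unchanged: it still computes the usual OVER,
dest = src + dest * (255 - alpha(src)) / 255, finished with a per-byte
saturating add (addu_s.qb). A C sketch using the helper above plus a second
illustrative helper for the saturating add:

    /* per-byte saturating add, i.e. what addu_s.qb does */
    static inline uint32_t
    un8x4_add_sat (uint32_t x, uint32_t y)
    {
        uint32_t r = 0;
        int s;

        for (s = 0; s < 32; s += 8)
        {
            uint32_t t = ((x >> s) & 0xff) + ((y >> s) & 0xff);
            r |= (t > 0xff ? 0xff : t) << s;
        }
        return r;
    }

    static void
    over_u_scanline (uint32_t *dest, const uint32_t *src, int width)
    {
        int i;

        for (i = 0; i < width; i++)
            dest[i] = un8x4_add_sat (src[i],
                                     un8x4_mul_un8 (dest[i], (~src[i]) >> 24));
    }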
@@ -88,35 +101,22 @@ pixman_asm_func pixman_composite_scanline_over_mask_asm_dspase1
srl $t8, $t8, 24 // mask >>= A_SHIFT
ins $t8, $t8, 16, 8 // 0:m:0:m; equivalent to replv.ph
- muleu_s.ph.qbl $t3, $t1, $t8
- muleu_s.ph.qbr $t4, $t1, $t8
+ DSPASE1_UN8x4_MUL_UN8_head $t1, $t8, $t3, $t4
lw $t0, 0($a1) // dest
- addu $t3, $t3, $t9 // can't overflow; rev2: addu_s.ph
- addu $t4, $t4, $t9 // can't overflow; rev2: addu_s.ph
- preceu.ph.qbla $t5, $t3 // rev2: shrl.ph
- preceu.ph.qbla $t6, $t4 // rev2: shrl.ph
- addu $t3, $t3, $t5 // can't overflow; rev2: addu_s.ph
- addu $t4, $t4, $t6 // can't overflow; rev2: addu_s.ph
- precrq.qb.ph $t1, $t3, $t4
+ DSPASE1_UN8x4_MUL_UN8_tail $t3, $t4, $t1, $t9, $t5, $t6
not $t2, $t1 // ~src
srl $t2, $t2, 24 // ALPHA_8(~src)
ins $t2, $t2, 16, 8 // 0:a:0:a; equivalent to replv.ph
- muleu_s.ph.qbl $t3, $t0, $t2
- muleu_s.ph.qbr $t4, $t0, $t2
+ DSPASE1_UN8x4_MUL_UN8_head $t0, $t2, $t3, $t4
addiu $a1, $a1, 4 // dest++
+
+ DSPASE1_UN8x4_MUL_UN8_tail $t3, $t4, $t3, $t9, $t5, $t6
- addu $t3, $t3, $t9 // can't overflow; rev2: addu_s.ph
- addu $t4, $t4, $t9 // can't overflow; rev2: addu_s.ph
- preceu.ph.qbla $t5, $t3 // rev2: shrl.ph
- preceu.ph.qbla $t6, $t4 // rev2: shrl.ph
- addu $t3, $t3, $t5 // can't overflow; rev2: addu_s.ph
- addu $t4, $t4, $t6 // can't overflow; rev2: addu_s.ph
- precrq.qb.ph $t3, $t3, $t4
addu_s.qb $t3, $t3, $t1
bne $a1, $a0, 0b
@@ -197,28 +197,18 @@ pixman_asm_func pixman_composite_scanline_add_mask_asm_dspase1
$scanline_add_mask_loop:
lwx $t2, $a3($a1)
lwx $t1, $a2($a1)
- lw $t0, 0($a1)
-
- addiu $a1, $a1, 4
# based on pixman_composite_scanline_over_mask_asm_dspase1
- # converting these to macroes might make sense
srl $t2, $t2, 24
ins $t2, $t2, 16, 8 // 0:m:0:m; equivalent to replv.ph
-
- muleu_s.ph.qbl $t3, $t1, $t2
- muleu_s.ph.qbr $t4, $t1, $t2
-
- addu $t3, $t3, $t8 // can't overflow; rev2: addu_s.ph
- addu $t4, $t4, $t8 // can't overflow; rev2: addu_s.ph
- preceu.ph.qbla $t5, $t3 // rev2: shrl.ph
- preceu.ph.qbla $t6, $t4 // rev2: shrl.ph
- addu $t3, $t3, $t5 // can't overflow; rev2: addu_s.ph
- addu $t4, $t4, $t6 // can't overflow; rev2: addu_s.ph
+ DSPASE1_UN8x4_MUL_UN8_head $t1, $t2, $t3, $t4
- precrq.qb.ph $t1, $t3, $t4
+ lw $t0, 0($a1)
+ addiu $a1, $a1, 4
+ DSPASE1_UN8x4_MUL_UN8_tail $t3, $t4, $t1, $t8, $t5, $t6
+
addu_s.qb $t0, $t0, $t1
bne $a1, $t9, $scanline_add_mask_loop
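The reordering above only improves scheduling; the masked add still computes
dest = dest (+sat) src * alpha(mask) per pixel. As a sketch, reusing the
illustrative helpers from the earlier notes:

    static void
    add_mask_scanline (uint32_t *dest, const uint32_t *src,
                       const uint32_t *mask, int width)
    {
        int i;

        for (i = 0; i < width; i++)
            dest[i] = un8x4_add_sat (dest[i],
                                     un8x4_mul_un8 (src[i], mask[i] >> 24));
    }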
@@ -229,60 +219,144 @@ $scanline_add_mask_exit:
pixman_end_func pixman_composite_scanline_add_mask_asm_dspase1
+# Scanline out reverse, no mask
+pixman_asm_func pixman_composite_scanline_out_reverse_asm_dspase1
+ beqz $a0, $scanline_out_reverse_exit
+ sll $a0, $a0, 2 # width in bytes (4 bytes per pixel)
+
+ li $t8, 0x00800080
+
+ subu $a2, $a2, $a1 // sdiff = src - dest (for LWX)
+
+ addu $t9, $a1, $a0
+$scanline_out_reverse_loop:
+ lwx $t1, $a2($a1)
+ lw $t0, 0($a1)
+
+ not $t1, $t1
+ srl $t1, $t1, 24 # ALPHA_8(~src)
+ ins $t1, $t1, 16, 8 // 0:a:0:a; equivalent to replv.ph
+
+ DSPASE1_UN8x4_MUL_UN8_head $t0, $t1, $t3, $t4
+
+ addiu $a1, $a1, 4
+
+ DSPASE1_UN8x4_MUL_UN8_tail $t3, $t4, $t0, $t8, $t5, $t6
+
+ bne $a1, $t9, $scanline_out_reverse_loop
+ sw $t0, -4($a1)
+$scanline_out_reverse_exit:
+ jr $ra
+ nop
+pixman_end_func pixman_composite_scanline_out_reverse_asm_dspase1
+
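OUT_REVERSE keeps only the part of dest that src does not cover:
dest = dest * (255 - alpha(src)) / 255. That is all the loop above does per
pixel; a C sketch with the same illustrative helper:

    static void
    out_reverse_scanline (uint32_t *dest, const uint32_t *src, int width)
    {
        int i;

        for (i = 0; i < width; i++)
            dest[i] = un8x4_mul_un8 (dest[i], (~src[i]) >> 24);
    }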
+# Scanline out reverse, mask
+pixman_asm_func pixman_composite_scanline_out_reverse_mask_asm_dspase1
+ beqz $a0, $scanline_out_reverse_mask_exit
+ sll $a0, $a0, 2 # width in bytes (4 bytes per pixel)
+
+ li $t8, 0x00800080
+
+ subu $a2, $a2, $a1 // sdiff = src - dest (for LWX)
+ subu $a3, $a3, $a1 // mdiff = mask - dest (for LWX)
+
+ addu $t9, $a1, $a0
+$scanline_out_reverse_mask_loop:
+ lwx $t2, $a3($a1)
+ lwx $t1, $a2($a1)
+
+ # combine mask
+ srl $t2, $t2, 24 # ALPHA_8(mask)
+ srl $t1, $t1, 24 # ALPHA_8(src)
+
+ mul $t3, $t2, $t1
+
+ lw $t0, 0($a1)
+
+ addiu $t3, $t3, 0x80
+ srl $t4, $t3, 8
+ addu $t3, $t3, $t4
+ srl $t3, $t3, 8 # t3 = round (ALPHA_8(mask) * ALPHA_8(src) / 255)
+ # mask combined
+
+ not $t1, $t3
+ andi $t1, $t1, 0xff
+ ins $t1, $t1, 16, 8 // 0:m:0:m; equivalent to replv.ph
+
+ DSPASE1_UN8x4_MUL_UN8_head $t0, $t1, $t3, $t4
+
+ addiu $a1, $a1, 4
+
+ DSPASE1_UN8x4_MUL_UN8_tail $t3, $t4, $t0, $t8, $t5, $t6
+
+ bne $a1, $t9, $scanline_out_reverse_mask_loop
+ sw $t0, -4($a1)
+$scanline_out_reverse_mask_exit:
+ jr $ra
+ nop
+pixman_end_func pixman_composite_scanline_out_reverse_mask_asm_dspase1
+
+
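The masked variant first scales alpha(src) by alpha(mask) using the scalar
form of the same rounding divide (the addiu/srl/addu/srl block above), then
applies the inverted result to dest. Sketch, with an illustrative scalar
helper:

    /* round (a * b / 255) exactly; the scalar form of the rounding divide */
    static inline uint32_t
    un8_mul (uint32_t a, uint32_t b)
    {
        uint32_t t = a * b + 0x80;

        return (t + (t >> 8)) >> 8;
    }

    static void
    out_reverse_mask_scanline (uint32_t *dest, const uint32_t *src,
                               const uint32_t *mask, int width)
    {
        int i;

        for (i = 0; i < width; i++)
            dest[i] = un8x4_mul_un8 (dest[i],
                                     255 - un8_mul (src[i] >> 24,
                                                    mask[i] >> 24));
    }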
////////////////////////////////////////////////////////////////////////////////
-pixman_asm_func mips_dspase1_composite_over_n_8_8888_inner
- beqz $a3, 1f
- sll $a3, $a3, 2 // width <<= 2
+pixman_asm_func pixman_composite_over_n_8_8888_asm_dspase1
+ lw $v0, 16($sp) # src
+ # 20($sp) is unused
+ lw $v1, 24($sp) # mask
+ lw $t7, 28($sp) # mask_stride
+
+ beqz $a0, $over_n_8_8888_end
+ addiu $sp, $sp, -4
+
+ sw $s0, 0($sp)
+
+ subu $t7, $t7, $a0 # mask_stride - width (mask is 1 byte/pixel)
- addu $a3, $a0, $a3 // dest_end = dest + width
+ sll $a0, $a0, 2 // width <<= 2
+ sll $a3, $a3, 2 # dst_stride in bytes
+
+ subu $a3, $a3, $a0 # dst_stride - width (in bytes)
li $t9, 0x00800080
-0:
- lbu $t8, 0($a2) // mask
- lw $t0, 0($a0) // dest
+$over_n_8_8888_height_loop:
+ addu $s0, $a0, $a2 # dst end
+ addiu $a1, $a1, -1
+
+$over_n_8_8888_width_loop:
+ lbu $t8, 0($v1) // mask
+ lw $t0, 0($a2) // dest
ins $t8, $t8, 16, 8 // 0:m:0:m; equivalent to replv.ph
- muleu_s.ph.qbl $t3, $a1, $t8
- muleu_s.ph.qbr $t4, $a1, $t8
+ DSPASE1_UN8x4_MUL_UN8_head $v0, $t8, $t3, $t4
- addiu $a0, $a0, 4 // dest++
- addiu $a2, $a2, 1 // mask++
+ addiu $a2, $a2, 4 // dest++
+ addiu $v1, $v1, 1 // mask++
- addu $t3, $t3, $t9 // can't overflow; rev2: addu_s.ph
- addu $t4, $t4, $t9 // can't overflow; rev2: addu_s.ph
- preceu.ph.qbla $t5, $t3 // rev2: shrl.ph
- preceu.ph.qbla $t6, $t4 // rev2: shrl.ph
- addu $t3, $t3, $t5 // can't overflow; rev2: addu_s.ph
- addu $t4, $t4, $t6 // can't overflow; rev2: addu_s.ph
- precrq.qb.ph $t1, $t3, $t4 // in(src,m)
+ DSPASE1_UN8x4_MUL_UN8_tail $t3, $t4, $t1, $t9, $t5, $t6
not $t2, $t1 // ~in(src,m)
srl $t2, $t2, 24
ins $t2, $t2, 16, 8 // 0:a:0:a; equivalent to replv.ph
- muleu_s.ph.qbl $t3, $t0, $t2
- muleu_s.ph.qbr $t4, $t0, $t2
+ DSPASE1_UN8x4_MUL_UN8_head $t0, $t2, $t3, $t4
+ DSPASE1_UN8x4_MUL_UN8_tail $t3, $t4, $t3, $t9, $t5, $t6
- addu $t3, $t3, $t9 // can't overflow; rev2: addu_s.ph
- addu $t4, $t4, $t9 // can't overflow; rev2: addu_s.ph
- preceu.ph.qbla $t5, $t3 // rev2: shrl.ph
- preceu.ph.qbla $t6, $t4 // rev2: shrl.ph
- addu $t3, $t3, $t5 // can't overflow; rev2: addu_s.ph
- addu $t4, $t4, $t6 // can't overflow; rev2: addu_s.ph
- precrq.qb.ph $t3, $t3, $t4
addu_s.qb $t3, $t3, $t1 // over(in(src,m),dest)
- bne $a0, $a3, 0b
- sw $t3, -4($a0) // dest
+ bne $a2, $s0, $over_n_8_8888_width_loop
+ sw $t3, -4($a2) // dest
-1:
+ addu $a2, $a2, $a3
+ bnez $a1, $over_n_8_8888_height_loop
+ addu $v1, $v1, $t7
+
+ lw $s0, 0($sp)
+$over_n_8_8888_end:
jr $ra
- nop
-
-pixman_end_func mips_dspase1_composite_over_n_8_8888_inner
+ addiu $sp, $sp, 4
+pixman_end_func pixman_composite_over_n_8_8888_asm_dspase1
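The asm now walks the whole image itself instead of being called once per
scanline: $a0 = width, $a1 = height, $a2 = dst, $a3 = dst_stride, with src, a
pad word, mask and mask_stride on the stack, matching the common binding
convention. Equivalent C for the two-level loop (strides in elements here;
a sketch, not the binding's exact expansion):

    static void
    over_n_8_8888 (int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride,
                   uint32_t src, const uint8_t *mask, int32_t mask_stride)
    {
        int32_t x, y;

        for (y = 0; y < h; y++, dst += dst_stride, mask += mask_stride)
        {
            for (x = 0; x < w; x++)
            {
                uint32_t in = un8x4_mul_un8 (src, mask[x]);    /* in(src, m) */

                /* over (in(src, m), dest) */
                dst[x] = un8x4_add_sat (in,
                                        un8x4_mul_un8 (dst[x], (~in) >> 24));
            }
        }
    }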
pixman_asm_func pixman_composite_add_8888_8888_asm_dspase1
lw $v0, 16($sp) # src
@@ -403,7 +477,7 @@ $add_n_8888_no_main_loop:
$add_n_8888_leftover_loop:
lw $t2, 0($a2)
- addiu $a2, $a2, 4 # Moving this anywhere else will cause a stall in store
+ addiu $a2, $a2, 4
addu_s.qb $t2, $t2, $v0
bne $a2, $t8, $add_n_8888_leftover_loop
diff --git a/pixman/pixman-mips-dspase1.c b/pixman/pixman-mips-dspase1.c
index 0ab3f87..b53c3df 100644
--- a/pixman/pixman-mips-dspase1.c
+++ b/pixman/pixman-mips-dspase1.c
@@ -8,10 +8,8 @@
// assembly-language functions
-void
-mips_dspase1_composite_over_n_8_8888_inner(uint32_t *dest, uint32_t src,
- const uint8_t *mask, int width);
-
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST(SKIP_ZERO_SRC, dspase1, over_n_8_8888,
+ uint8_t, 1, uint32_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_SRC_DST(dspase1, add_8888_8888,
uint32_t, 1, uint32_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_N_DST(SKIP_ZERO_SRC, dspase1, add_n_8888,
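PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (from pixman-arm-common.h, which this
MIPS port reuses) generates a C wrapper replacing the hand-written one removed
below. Roughly, and treating this as a sketch of the expansion rather than the
exact macro output, it does:

    static void
    dspase1_composite_over_n_8_8888 (pixman_implementation_t *imp,
                                     pixman_op_t op,
                                     pixman_image_t *src_image,
                                     pixman_image_t *mask_image,
                                     pixman_image_t *dst_image,
                                     int32_t src_x,  int32_t src_y,
                                     int32_t mask_x, int32_t mask_y,
                                     int32_t dest_x, int32_t dest_y,
                                     int32_t width,  int32_t height)
    {
        uint32_t  src;
        uint32_t *dst_line;
        uint8_t  *mask_line;
        int       dst_stride, mask_stride;

        src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);

        if (src == 0)       /* SKIP_ZERO_SRC */
            return;

        PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t,
                               dst_stride, dst_line, 1);
        PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t,
                               mask_stride, mask_line, 1);

        pixman_composite_over_n_8_8888_asm_dspase1 (width, height,
                                                    dst_line, dst_stride,
                                                    src, 0,
                                                    mask_line, mask_stride);
    }

Since the height loop moved into the assembly, the wrapper makes a single call
per composite instead of one per scanline.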
@@ -19,46 +17,6 @@ PIXMAN_ARM_BIND_FAST_PATH_N_DST(SKIP_ZERO_SRC, dspase1, add_n_8888,
////////////////////////////////////////////////////////////////////////////////
-static void
-mips_dspase1_fast_composite_over_n_8_8888(pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- uint32_t src, srca;
- uint32_t *dst_line, *dst;
- uint8_t *mask_line, *mask;
- int dst_stride, mask_stride;
-
- src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
-
- srca = src >> 24;
- if (src == 0)
- return;
-
- PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
-
- mips_dspase1_composite_over_n_8_8888_inner(dst, src, mask, width);
- }
-}
-
#define BIND_COMBINE_U(name) \
void \
pixman_composite_scanline_##name##_mask_asm_dspase1 (int32_t w, \
@@ -88,13 +46,14 @@ dspase1_combine_##name##_u (pixman_implementation_t *imp, \
BIND_COMBINE_U (over)
BIND_COMBINE_U (add)
+BIND_COMBINE_U (out_reverse)
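BIND_COMBINE_U (partially visible above) produces the combine_32-style entry
point; for out_reverse it dispatches between the two new scanline routines.
Approximately, as a sketch of the macro's output:

    static void
    dspase1_combine_out_reverse_u (pixman_implementation_t *imp,
                                   pixman_op_t              op,
                                   uint32_t                *dest,
                                   const uint32_t          *src,
                                   const uint32_t          *mask,
                                   int                      width)
    {
        if (mask)
            pixman_composite_scanline_out_reverse_mask_asm_dspase1 (width, dest,
                                                                    src, mask);
        else
            pixman_composite_scanline_out_reverse_asm_dspase1 (width, dest, src);
    }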
////////////////////////////////////////////////////////////////////////////////
static const pixman_fast_path_t mips_dspase1_fast_paths[] =
{
- PIXMAN_STD_FAST_PATH(OVER, solid, a8, a8r8g8b8, mips_dspase1_fast_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH(OVER, solid, a8, a8r8g8b8, dspase1_composite_over_n_8_8888 ),
PIXMAN_STD_FAST_PATH (ADD, solid, null, a8r8g8b8, dspase1_composite_add_n_8888 ),
PIXMAN_STD_FAST_PATH (ADD, solid, null, x8r8g8b8, dspase1_composite_add_n_8888 ),
@@ -115,6 +74,7 @@ _pixman_implementation_create_mips_dspase1 (pixman_implementation_t *delegate)
imp->combine_32[PIXMAN_OP_OVER] = dspase1_combine_over_u;
imp->combine_32[PIXMAN_OP_ADD] = dspase1_combine_add_u;
+ imp->combine_32[PIXMAN_OP_OUT_REVERSE] = dspase1_combine_out_reverse_u;
return imp;
}
--
1.7.0.4