[Pixman] [PATCH 2/2] MIPS: DSPr2: Added fast-paths for SRC operation.
Nemanja Lukic
nlukic at mips.com
Fri Feb 10 07:02:24 PST 2012
From: Nemanja Lukic <nemanja.lukic at rt-rk.com>
Following fast-path functions are implemented (routines 4, 5 and 6 utilize
same fast-memcpy routine):
1. src_x888_8888
2. src_8888_0565
3. src_0565_8888
4. src_0565_0565
5. src_8888_8888
6. src_0888_0888
Performance numbers before/after on MIPS-74kc @ 1GHz
Referent (before):
lowlevel-blt-bench:
src_x888_8888 = L1: 199.35 L2: 96.54 M: 18.87 (100.68%) HT: 17.12 VT: 16.24 R: 15.43 RT: 9.33 ( 61Kops/s)
src_8888_0565 = L1: 71.22 L2: 51.95 M: 24.19 ( 96.17%) HT: 20.71 VT: 19.92 R: 18.15 RT: 9.92 ( 63Kops/s)
src_0565_8888 = L1: 38.82 L2: 36.22 M: 18.60 ( 73.95%) HT: 14.47 VT: 13.19 R: 12.97 RT: 6.61 ( 49Kops/s)
src_0565_0565 = L1: 286.05 L2: 155.02 M: 37.68 (100.54%) HT: 31.08 VT: 28.07 R: 26.26 RT: 11.93 ( 68Kops/s)
src_8888_8888 = L1: 454.32 L2: 139.15 M: 19.30 (102.98%) HT: 17.73 VT: 16.08 R: 16.62 RT: 10.45 ( 64Kops/s)
src_0888_0888 = L1: 190.47 L2: 106.14 M: 25.26 (101.08%) HT: 21.88 VT: 20.32 R: 18.83 RT: 10.10 ( 63Kops/s)
cairo-perf-trace:
[ # ] backend test min(s) median(s) stddev. count
[ # ] image: pixman 0.25.1
[ 0] image firefox-asteroids 421.215 421.325 0.01% 4/6
[ 1] image firefox-planet-gnome 647.708 648.486 0.13% 6/6
[ 2] image gnome-system-monitor 276.073 277.506 0.38% 6/6
[ 3] image gnome-terminal-vim 263.866 265.229 0.39% 6/6
[ 4] image poppler 123.576 124.003 0.15% 6/6
Optimized (with these optimizations):
lowlevel-blt-bench:
src_x888_8888 = L1: 369.50 L2: 99.37 M: 27.19 (145.07%) HT: 20.24 VT: 19.48 R: 19.00 RT: 10.22 ( 63Kops/s)
src_8888_0565 = L1: 105.65 L2: 67.87 M: 25.41 (101.00%) HT: 20.78 VT: 19.84 R: 18.52 RT: 9.81 ( 63Kops/s)
src_0565_8888 = L1: 77.10 L2: 63.04 M: 23.37 ( 92.90%) HT: 20.29 VT: 19.37 R: 18.14 RT: 10.02 ( 63Kops/s)
src_0565_0565 = L1: 519.02 L2: 241.32 M: 62.35 (166.34%) HT: 33.74 VT: 27.63 R: 26.12 RT: 11.70 ( 67Kops/s)
src_8888_8888 = L1: 390.48 L2: 113.99 M: 30.32 (161.77%) HT: 19.55 VT: 17.05 R: 17.13 RT: 10.19 ( 63Kops/s)
src_0888_0888 = L1: 349.74 L2: 156.68 M: 40.68 (162.78%) HT: 25.58 VT: 20.57 R: 20.20 RT: 9.96 ( 63Kops/s)
cairo-perf-trace:
[ # ] backend test min(s) median(s) stddev. count
[ # ] image: pixman 0.25.1
[ 0] image firefox-asteroids 400.050 400.308 0.04% 6/6
[ 1] image firefox-planet-gnome 628.978 629.364 0.07% 6/6
[ 2] image gnome-system-monitor 270.247 270.313 0.03% 6/6
[ 3] image gnome-terminal-vim 256.413 257.641 0.21% 6/6
[ 4] image poppler 119.540 120.023 0.21% 6/6
---
pixman/Makefile.am | 5 +-
pixman/pixman-mips-dspr2-asm.S | 217 ++++++++++++++++++++++
pixman/pixman-mips-dspr2-asm.h | 179 ++++++++++++++++++
pixman/pixman-mips-dspr2.c | 33 ++++
pixman/pixman-mips-dspr2.h | 46 +++++
pixman/pixman-mips-memcpy-asm.S | 388 +++++++++++++++++++++++++++++++++++++++
6 files changed, 867 insertions(+), 1 deletions(-)
create mode 100644 pixman/pixman-mips-dspr2-asm.S
create mode 100644 pixman/pixman-mips-dspr2-asm.h
create mode 100644 pixman/pixman-mips-memcpy-asm.S
diff --git a/pixman/Makefile.am b/pixman/Makefile.am
index a7fba33..fb7e047 100644
--- a/pixman/Makefile.am
+++ b/pixman/Makefile.am
@@ -107,7 +107,10 @@ if USE_MIPS_DSPR2
noinst_LTLIBRARIES += libpixman-mips-dspr2.la
libpixman_mips_dspr2_la_SOURCES = \
pixman-mips-dspr2.c \
- pixman-mips-dspr2.h
+ pixman-mips-dspr2.h \
+ pixman-mips-dspr2-asm.S \
+ pixman-mips-dspr2-asm.h \
+ pixman-mips-memcpy-asm.S
libpixman_mips_dspr2_la_CFLAGS = $(DEP_CFLAGS)
libpixman_mips_dspr2_la_LIBADD = $(DEP_LIBS)
libpixman_1_la_LIBADD += libpixman-mips-dspr2.la
diff --git a/pixman/pixman-mips-dspr2-asm.S b/pixman/pixman-mips-dspr2-asm.S
new file mode 100644
index 0000000..07ef6b4
--- /dev/null
+++ b/pixman/pixman-mips-dspr2-asm.S
@@ -0,0 +1,217 @@
+/*
+ * Copyright (c) 2011
+ * MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author: Nemanja Lukic (nlukic at mips.com)
+ */
+
+#include "pixman-mips-dspr2-asm.h"
+
+LEAF(pixman_composite_src_8888_0565_asm_mips)
+/*
+ * a0 - dst (r5g6b5)
+ * a1 - src (a8r8g8b8)
+ * a2 - w
+ */
+
+ .set noreorder
+
+ beqz a2, 3f
+ nop
+ addiu t1, a2, -1
+ beqz t1, 2f
+ nop
+ li t4, 0xf800f800
+ li t5, 0x07e007e0
+ li t6, 0x001f001f
+1:
+ lw t0, 0(a1)
+ lw t1, 4(a1)
+ addiu a1, a1, 8
+ addiu a2, a2, -2
+
+ CONVERT_2x8888_TO_2x0565 t0, t1, t2, t3, t4, t5, t6, t7, t8
+
+ sh t2, 0(a0)
+ sh t3, 2(a0)
+
+ addiu t2, a2, -1
+ bgtz t2, 1b
+ addiu a0, a0, 4
+2:
+ beqz a2, 3f
+ nop
+ lw t0, 0(a1)
+
+ CONVERT_1x8888_TO_1x0565 t0, t1, t2, t3
+
+ sh t1, 0(a0)
+3:
+ j ra
+ nop
+
+ .set reorder
+
+END(pixman_composite_src_8888_0565_asm_mips)
+
+LEAF(pixman_composite_src_0565_8888_asm_mips)
+/*
+ * a0 - dst (a8r8g8b8)
+ * a1 - src (r5g6b5)
+ * a2 - w
+ */
+
+ .set noreorder
+
+ beqz a2, 3f
+ nop
+ addiu t1, a2, -1
+ beqz t1, 2f
+ nop
+ li t4, 0x07e007e0
+ li t5, 0x001F001F
+1:
+ lhu t0, 0(a1)
+ lhu t1, 2(a1)
+ addiu a1, a1, 4
+ addiu a2, a2, -2
+
+ CONVERT_2x0565_TO_2x8888 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9
+
+ sw t2, 0(a0)
+ sw t3, 4(a0)
+
+ addiu t2, a2, -1
+ bgtz t2, 1b
+ addiu a0, a0, 8
+2:
+ beqz a2, 3f
+ nop
+ lhu t0, 0(a1)
+
+ CONVERT_1x0565_TO_1x8888 t0, t1, t2, t3
+
+ sw t1, 0(a0)
+3:
+ j ra
+ nop
+
+ .set reorder
+
+END(pixman_composite_src_0565_8888_asm_mips)
+
+LEAF(pixman_composite_src_x888_8888_asm_mips)
+/*
+ * a0 - dst (a8r8g8b8)
+ * a1 - src (x8r8g8b8)
+ * a2 - w
+ */
+
+ .set noreorder
+
+ beqz a2, 4f
+ nop
+ li t9, 0xff000000
+ srl t8, a2, 3 /* t1 = how many multiples of 8 src pixels */
+ beqz t8, 3f /* branch if less than 8 src pixels */
+ nop
+1:
+ addiu t8, t8, -1
+ beqz t8, 2f
+ addiu a2, a2, -8
+ pref 0, 32(a1)
+ lw t0, 0(a1)
+ lw t1, 4(a1)
+ lw t2, 8(a1)
+ lw t3, 12(a1)
+ lw t4, 16(a1)
+ lw t5, 20(a1)
+ lw t6, 24(a1)
+ lw t7, 28(a1)
+ addiu a1, a1, 32
+ or t0, t0, t9
+ or t1, t1, t9
+ or t2, t2, t9
+ or t3, t3, t9
+ or t4, t4, t9
+ or t5, t5, t9
+ or t6, t6, t9
+ or t7, t7, t9
+ pref 30, 32(a0)
+ sw t0, 0(a0)
+ sw t1, 4(a0)
+ sw t2, 8(a0)
+ sw t3, 12(a0)
+ sw t4, 16(a0)
+ sw t5, 20(a0)
+ sw t6, 24(a0)
+ sw t7, 28(a0)
+ b 1b
+ addiu a0, a0, 32
+2:
+ lw t0, 0(a1)
+ lw t1, 4(a1)
+ lw t2, 8(a1)
+ lw t3, 12(a1)
+ lw t4, 16(a1)
+ lw t5, 20(a1)
+ lw t6, 24(a1)
+ lw t7, 28(a1)
+ addiu a1, a1, 32
+ or t0, t0, t9
+ or t1, t1, t9
+ or t2, t2, t9
+ or t3, t3, t9
+ or t4, t4, t9
+ or t5, t5, t9
+ or t6, t6, t9
+ or t7, t7, t9
+ sw t0, 0(a0)
+ sw t1, 4(a0)
+ sw t2, 8(a0)
+ sw t3, 12(a0)
+ sw t4, 16(a0)
+ sw t5, 20(a0)
+ sw t6, 24(a0)
+ sw t7, 28(a0)
+ beqz a2, 4f
+ addiu a0, a0, 32
+3:
+ lw t0, 0(a1)
+ addiu a1, a1, 4
+ addiu a2, a2, -1
+ or t1, t0, t9
+ sw t1, 0(a0)
+ bnez a2, 3b
+ addiu a0, a0, 4
+4:
+ jr ra
+ nop
+
+ .set reorder
+
+END(pixman_composite_src_x888_8888_asm_mips)
diff --git a/pixman/pixman-mips-dspr2-asm.h b/pixman/pixman-mips-dspr2-asm.h
new file mode 100644
index 0000000..51a0f99
--- /dev/null
+++ b/pixman/pixman-mips-dspr2-asm.h
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2011
+ * MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author: Nemanja Lukic (nlukic at mips.com)
+ */
+
+#ifndef PIXMAN_MIPS_DSPR2_ASM_H
+#define PIXMAN_MIPS_DSPR2_ASM_H
+
+#include "sys/asm.h"
+
+#define zero $0
+#define AT $1
+#define v0 $2
+#define v1 $3
+#define a0 $4
+#define a1 $5
+#define a2 $6
+#define a3 $7
+#define t0 $8
+#define t1 $9
+#define t2 $10
+#define t3 $11
+#define t4 $12
+#define t5 $13
+#define t6 $14
+#define t7 $15
+#define s0 $16
+#define s1 $17
+#define s2 $18
+#define s3 $19
+#define s4 $20
+#define s5 $21
+#define s6 $22
+#define s7 $23
+#define t8 $24
+#define t9 $25
+#define k0 $26
+#define k1 $27
+#define gp $28
+#define sp $29
+#define fp $30
+#define s8 $30
+#define ra $31
+
+/*
+ * Conversion of single r5g6b5 pixel (in_565) to single a8r8g8b8 pixel
+ * returned in (out_8888) register. Requires two temporary registers
+ * (scratch1 and scratch2).
+ */
+.macro CONVERT_1x0565_TO_1x8888 in_565, \
+ out_8888, \
+ scratch1, scratch2
+ lui \out_8888, 0xff00
+ sll \scratch1, \in_565, 0x3
+ andi \scratch2, \scratch1, 0xff
+ ext \scratch1, \in_565, 0x2, 0x3
+ or \scratch1, \scratch2, \scratch1
+ or \out_8888, \out_8888, \scratch1
+
+ sll \scratch1, \in_565, 0x5
+ andi \scratch1, \scratch1, 0xfc00
+ srl \scratch2, \in_565, 0x1
+ andi \scratch2, \scratch2, 0x300
+ or \scratch2, \scratch1, \scratch2
+ or \out_8888, \out_8888, \scratch2
+
+ andi \scratch1, \in_565, 0xf800
+ srl \scratch2, \scratch1, 0x5
+ andi \scratch2, \scratch2, 0xff00
+ or \scratch1, \scratch1, \scratch2
+ sll \scratch1, \scratch1, 0x8
+ or \out_8888, \out_8888, \scratch1
+.endm
+
+/*
+ * Conversion of two r5g6b5 pixels (in1_565 and in2_565) to two a8r8g8b8 pixels
+ * returned in (out1_8888 and out2_8888) registers. Requires four scratch
+ * registers (scratch1 ... scratch4). It also requires maskG and maskB for
+ * color component extractions. These masks must have following values:
+ * li maskG, 0x07e007e0
+ * li maskB, 0x001F001F
+ */
+.macro CONVERT_2x0565_TO_2x8888 in1_565, in2_565, \
+ out1_8888, out2_8888, \
+ maskG, maskB, \
+ scratch1, scratch2, scratch3, scratch4
+ sll \scratch1, \in1_565, 16
+ or \scratch1, \scratch1, \in2_565
+ lui \out2_8888, 0xff00
+ ori \out2_8888, \out2_8888, 0xff00
+ shrl.ph \scratch2, \scratch1, 11
+ and \scratch3, \scratch1, \maskG
+ shra.ph \scratch4, \scratch2, 2
+ shll.ph \scratch2, \scratch2, 3
+ shll.ph \scratch3, \scratch3, 5
+ or \scratch2, \scratch2, \scratch4
+ shrl.qb \scratch4, \scratch3, 6
+ or \out2_8888, \out2_8888, \scratch2
+ or \scratch3, \scratch3, \scratch4
+ and \scratch1, \scratch1, \maskB
+ shll.ph \scratch2, \scratch1, 3
+ shra.ph \scratch4, \scratch1, 2
+ or \scratch2, \scratch2, \scratch4
+ or \scratch3, \scratch2, \scratch3
+ precrq.ph.w \out1_8888, \out2_8888, \scratch3
+ precr_sra.ph.w \out2_8888, \scratch3, 0
+.endm
+
+/*
+ * Conversion of single a8r8g8b8 pixel (in_8888) to single r5g6b5 pixel
+ * returned in (out_565) register. Requires two temporary registers
+ * (scratch1 and scratch2).
+ */
+.macro CONVERT_1x8888_TO_1x0565 in_8888, \
+ out_565, \
+ scratch1, scratch2
+ ext \out_565, \in_8888, 0x3, 0x5
+ srl \scratch1, \in_8888, 0x5
+ andi \scratch1, \scratch1, 0x07e0
+ srl \scratch2, \in_8888, 0x8
+ andi \scratch2, \scratch2, 0xf800
+ or \out_565, \out_565, \scratch1
+ or \out_565, \out_565, \scratch2
+.endm
+
+/*
+ * Conversion of two a8r8g8b8 pixels (in1_8888 and in2_8888) to two r5g6b5
+ * pixels returned in (out1_565 and out2_565) registers. Requires two temporary
+ * registers (scratch1 and scratch2). It also requires maskR, maskG and maskB
+ * for color component extractions. These masks must have following values:
+ * li maskR, 0xf800f800
+ * li maskG, 0x07e007e0
+ * li maskB, 0x001F001F
+ * Value of input register in2_8888 is lost.
+ */
+.macro CONVERT_2x8888_TO_2x0565 in1_8888, in2_8888, \
+ out1_565, out2_565, \
+ maskR, maskG, maskB, \
+ scratch1, scratch2
+ precrq.ph.w \scratch1, \in2_8888, \in1_8888
+ precr_sra.ph.w \in2_8888, \in1_8888, 0
+ shll.ph \scratch1, \scratch1, 8
+ srl \in2_8888, \in2_8888, 3
+ and \scratch2, \in2_8888, \maskB
+ and \scratch1, \scratch1, \maskR
+ srl \in2_8888, \in2_8888, 2
+ and \out2_565, \in2_8888, \maskG
+ or \out2_565, \out2_565, \scratch2
+ or \out1_565, \out2_565, \scratch1
+ srl \out2_565, \out1_565, 16
+.endm
+
+#endif //PIXMAN_MIPS_DSPR2_ASM_H
diff --git a/pixman/pixman-mips-dspr2.c b/pixman/pixman-mips-dspr2.c
index 518dae1..e331853 100644
--- a/pixman/pixman-mips-dspr2.c
+++ b/pixman/pixman-mips-dspr2.c
@@ -36,8 +36,41 @@
#include "pixman-private.h"
#include "pixman-mips-dspr2.h"
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, src_x888_8888,
+ uint32_t, 1, uint32_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, src_8888_0565,
+ uint32_t, 1, uint16_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, src_0565_8888,
+ uint16_t, 1, uint32_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (DO_FAST_MEMCPY, src_0565_0565,
+ uint16_t, 1, uint16_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (DO_FAST_MEMCPY, src_8888_8888,
+ uint32_t, 1, uint32_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (DO_FAST_MEMCPY, src_0888_0888,
+ uint8_t, 3, uint8_t, 3)
+
static const pixman_fast_path_t mips_dspr2_fast_paths[] =
{
+ PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, mips_composite_src_0565_0565),
+ PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, mips_composite_src_0565_0565),
+ PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, mips_composite_src_8888_0565),
+ PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, mips_composite_src_8888_0565),
+ PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, mips_composite_src_8888_0565),
+ PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, mips_composite_src_8888_0565),
+ PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, a8r8g8b8, mips_composite_src_0565_8888),
+ PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, x8r8g8b8, mips_composite_src_0565_8888),
+ PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, a8b8g8r8, mips_composite_src_0565_8888),
+ PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, x8b8g8r8, mips_composite_src_0565_8888),
+ PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, mips_composite_src_8888_8888),
+ PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, mips_composite_src_8888_8888),
+ PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, mips_composite_src_8888_8888),
+ PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, mips_composite_src_8888_8888),
+ PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, mips_composite_src_8888_8888),
+ PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, mips_composite_src_8888_8888),
+ PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, mips_composite_src_x888_8888),
+ PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, mips_composite_src_x888_8888),
+ PIXMAN_STD_FAST_PATH (SRC, r8g8b8, null, r8g8b8, mips_composite_src_0888_0888),
+
{ PIXMAN_OP_NONE },
};
diff --git a/pixman/pixman-mips-dspr2.h b/pixman/pixman-mips-dspr2.h
index 4c764a8..449c42a 100644
--- a/pixman/pixman-mips-dspr2.h
+++ b/pixman/pixman-mips-dspr2.h
@@ -35,4 +35,50 @@
#include "pixman-private.h"
#include "pixman-inlines.h"
+#define SKIP_ZERO_SRC 1
+#define SKIP_ZERO_MASK 2
+#define DO_FAST_MEMCPY 3
+
+void
+pixman_mips_fast_memcpy (void *dst, void *src, uint32_t n_bytes);
+
+/****************************************************************/
+
+#define PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST(flags, name, \
+ src_type, src_cnt, \
+ dst_type, dst_cnt) \
+void \
+pixman_composite_##name##_asm_mips (dst_type *dst, \
+ src_type *src, \
+ int32_t w); \
+ \
+static void \
+mips_composite_##name (pixman_implementation_t *imp, \
+ pixman_composite_info_t *info) \
+{ \
+ PIXMAN_COMPOSITE_ARGS (info); \
+ dst_type *dst_line, *dst; \
+ src_type *src_line, *src; \
+ int32_t dst_stride, src_stride; \
+ int bpp = PIXMAN_FORMAT_BPP (dest_image->bits.format) / 8; \
+ \
+ PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, src_type, \
+ src_stride, src_line, src_cnt); \
+ PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type, \
+ dst_stride, dst_line, dst_cnt); \
+ \
+ while (height--) \
+ { \
+ dst = dst_line; \
+ dst_line += dst_stride; \
+ src = src_line; \
+ src_line += src_stride; \
+ \
+ if (flags == DO_FAST_MEMCPY) \
+ pixman_mips_fast_memcpy (dst, src, width * bpp); \
+ else \
+ pixman_composite_##name##_asm_mips (dst, src, width); \
+ } \
+}
+
#endif //PIXMAN_MIPS_DSPR2_H
diff --git a/pixman/pixman-mips-memcpy-asm.S b/pixman/pixman-mips-memcpy-asm.S
new file mode 100644
index 0000000..6d21675
--- /dev/null
+++ b/pixman/pixman-mips-memcpy-asm.S
@@ -0,0 +1,388 @@
+/*
+ * Copyright (c) 2011
+ * MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "pixman-mips-dspr2-asm.h"
+
+/*
+ * This routine could be optimized for MIPS64. The current code only
+ * uses MIPS32 instructions.
+ */
+
+#ifdef EB
+# define LWHI lwl /* high part is left in big-endian */
+# define SWHI swl /* high part is left in big-endian */
+# define LWLO lwr /* low part is right in big-endian */
+# define SWLO swr /* low part is right in big-endian */
+#else
+# define LWHI lwr /* high part is right in little-endian */
+# define SWHI swr /* high part is right in little-endian */
+# define LWLO lwl /* low part is left in big-endian */
+# define SWLO swl /* low part is left in big-endian */
+#endif
+
+LEAF(pixman_mips_fast_memcpy)
+
+ .set noreorder
+ .set noat
+
+ slti AT, a2, 8
+ bne AT, zero, $last8
+ move v0, a0 /* memcpy returns the dst pointer */
+
+/* Test if the src and dst are word-aligned, or can be made word-aligned */
+ xor t8, a1, a0
+ andi t8, t8, 0x3 /* t8 is a0/a1 word-displacement */
+
+ bne t8, zero, $unaligned
+ negu a3, a0
+
+ andi a3, a3, 0x3 /* we need to copy a3 bytes to make a0/a1 aligned */
+ beq a3, zero, $chk16w /* when a3=0 then the dst (a0) is word-aligned */
+ subu a2, a2, a3 /* now a2 is the remining bytes count */
+
+ LWHI t8, 0(a1)
+ addu a1, a1, a3
+ SWHI t8, 0(a0)
+ addu a0, a0, a3
+
+/* Now the dst/src are mutually word-aligned with word-aligned addresses */
+$chk16w: andi t8, a2, 0x3f /* any whole 64-byte chunks? */
+ /* t8 is the byte count after 64-byte chunks */
+
+ beq a2, t8, $chk8w /* if a2==t8, no 64-byte chunks */
+ /* There will be at most 1 32-byte chunk after it */
+ subu a3, a2, t8 /* subtract from a2 the reminder */
+ /* Here a3 counts bytes in 16w chunks */
+ addu a3, a0, a3 /* Now a3 is the final dst after 64-byte chunks */
+
+ addu t0, a0, a2 /* t0 is the "past the end" address */
+
+/*
+ * When in the loop we exercise "pref 30, x(a0)", the a0+x should not be past
+ * the "t0-32" address
+ * This means: for x=128 the last "safe" a0 address is "t0-160"
+ * Alternatively, for x=64 the last "safe" a0 address is "t0-96"
+ * In the current version we use "pref 30, 128(a0)", so "t0-160" is the limit
+ */
+ subu t9, t0, 160 /* t9 is the "last safe pref 30, 128(a0)" address */
+
+ pref 0, 0(a1) /* bring the first line of src, addr 0 */
+ pref 0, 32(a1) /* bring the second line of src, addr 32 */
+ pref 0, 64(a1) /* bring the third line of src, addr 64 */
+ pref 30, 32(a0) /* safe, as we have at least 64 bytes ahead */
+/* In case the a0 > t9 don't use "pref 30" at all */
+ sgtu v1, a0, t9
+ bgtz v1, $loop16w /* skip "pref 30, 64(a0)" for too short arrays */
+ nop
+/* otherwise, start with using pref30 */
+ pref 30, 64(a0)
+$loop16w:
+ pref 0, 96(a1)
+ lw t0, 0(a1)
+ bgtz v1, $skip_pref30_96 /* skip "pref 30, 96(a0)" */
+ lw t1, 4(a1)
+ pref 30, 96(a0) /* continue setting up the dest, addr 96 */
+$skip_pref30_96:
+ lw t2, 8(a1)
+ lw t3, 12(a1)
+ lw t4, 16(a1)
+ lw t5, 20(a1)
+ lw t6, 24(a1)
+ lw t7, 28(a1)
+ pref 0, 128(a1) /* bring the next lines of src, addr 128 */
+
+ sw t0, 0(a0)
+ sw t1, 4(a0)
+ sw t2, 8(a0)
+ sw t3, 12(a0)
+ sw t4, 16(a0)
+ sw t5, 20(a0)
+ sw t6, 24(a0)
+ sw t7, 28(a0)
+
+ lw t0, 32(a1)
+ bgtz v1, $skip_pref30_128 /* skip "pref 30, 128(a0)" */
+ lw t1, 36(a1)
+ pref 30, 128(a0) /* continue setting up the dest, addr 128 */
+$skip_pref30_128:
+ lw t2, 40(a1)
+ lw t3, 44(a1)
+ lw t4, 48(a1)
+ lw t5, 52(a1)
+ lw t6, 56(a1)
+ lw t7, 60(a1)
+ pref 0, 160(a1) /* bring the next lines of src, addr 160 */
+
+ sw t0, 32(a0)
+ sw t1, 36(a0)
+ sw t2, 40(a0)
+ sw t3, 44(a0)
+ sw t4, 48(a0)
+ sw t5, 52(a0)
+ sw t6, 56(a0)
+ sw t7, 60(a0)
+
+ addiu a0, a0, 64 /* adding 64 to dest */
+ sgtu v1, a0, t9
+ bne a0, a3, $loop16w
+ addiu a1, a1, 64 /* adding 64 to src */
+ move a2, t8
+
+/* Here we have src and dest word-aligned but less than 64-bytes to go */
+
+$chk8w:
+ pref 0, 0x0(a1)
+ andi t8, a2, 0x1f /* is there a 32-byte chunk? */
+ /* the t8 is the reminder count past 32-bytes */
+ beq a2, t8, $chk1w /* when a2=t8, no 32-byte chunk */
+ nop
+
+ lw t0, 0(a1)
+ lw t1, 4(a1)
+ lw t2, 8(a1)
+ lw t3, 12(a1)
+ lw t4, 16(a1)
+ lw t5, 20(a1)
+ lw t6, 24(a1)
+ lw t7, 28(a1)
+ addiu a1, a1, 32
+
+ sw t0, 0(a0)
+ sw t1, 4(a0)
+ sw t2, 8(a0)
+ sw t3, 12(a0)
+ sw t4, 16(a0)
+ sw t5, 20(a0)
+ sw t6, 24(a0)
+ sw t7, 28(a0)
+ addiu a0, a0, 32
+
+$chk1w:
+ andi a2, t8, 0x3 /* now a2 is the reminder past 1w chunks */
+ beq a2, t8, $last8
+ subu a3, t8, a2 /* a3 is count of bytes in 1w chunks */
+ addu a3, a0, a3 /* now a3 is the dst address past the 1w chunks */
+
+/* copying in words (4-byte chunks) */
+$wordCopy_loop:
+ lw t3, 0(a1) /* the first t3 may be equal t0 ... optimize? */
+ addiu a1, a1, 4
+ addiu a0, a0, 4
+ bne a0, a3, $wordCopy_loop
+ sw t3, -4(a0)
+
+/* For the last (<8) bytes */
+$last8:
+ blez a2, leave
+ addu a3, a0, a2 /* a3 is the last dst address */
+$last8loop:
+ lb v1, 0(a1)
+ addiu a1, a1, 1
+ addiu a0, a0, 1
+ bne a0, a3, $last8loop
+ sb v1, -1(a0)
+
+leave: j ra
+ nop
+
+/*
+ * UNALIGNED case
+ */
+
+$unaligned:
+ /* got here with a3="negu a0" */
+ andi a3, a3, 0x3 /* test if the a0 is word aligned */
+ beqz a3, $ua_chk16w
+ subu a2, a2, a3 /* bytes left after initial a3 bytes */
+
+ LWHI v1, 0(a1)
+ LWLO v1, 3(a1)
+ addu a1, a1, a3 /* a3 may be here 1, 2 or 3 */
+ SWHI v1, 0(a0)
+ addu a0, a0, a3 /* below the dst will be word aligned (NOTE1) */
+
+$ua_chk16w: andi t8, a2, 0x3f /* any whole 64-byte chunks? */
+ /* t8 is the byte count after 64-byte chunks */
+ beq a2, t8, $ua_chk8w /* if a2==t8, no 64-byte chunks */
+ /* There will be at most 1 32-byte chunk after it */
+ subu a3, a2, t8 /* subtract from a2 the reminder */
+ /* Here a3 counts bytes in 16w chunks */
+ addu a3, a0, a3 /* Now a3 is the final dst after 64-byte chunks */
+
+ addu t0, a0, a2 /* t0 is the "past the end" address */
+
+ subu t9, t0, 160 /* t9 is the "last safe pref 30, 128(a0)" address */
+
+ pref 0, 0(a1) /* bring the first line of src, addr 0 */
+ pref 0, 32(a1) /* bring the second line of src, addr 32 */
+ pref 0, 64(a1) /* bring the third line of src, addr 64 */
+ pref 30, 32(a0) /* safe, as we have at least 64 bytes ahead */
+/* In case the a0 > t9 don't use "pref 30" at all */
+ sgtu v1, a0, t9
+ bgtz v1, $ua_loop16w /* skip "pref 30, 64(a0)" for too short arrays */
+ nop
+/* otherwise, start with using pref30 */
+ pref 30, 64(a0)
+$ua_loop16w:
+ pref 0, 96(a1)
+ LWHI t0, 0(a1)
+ LWLO t0, 3(a1)
+ LWHI t1, 4(a1)
+ bgtz v1, $ua_skip_pref30_96
+ LWLO t1, 7(a1)
+ pref 30, 96(a0) /* continue setting up the dest, addr 96 */
+$ua_skip_pref30_96:
+ LWHI t2, 8(a1)
+ LWLO t2, 11(a1)
+ LWHI t3, 12(a1)
+ LWLO t3, 15(a1)
+ LWHI t4, 16(a1)
+ LWLO t4, 19(a1)
+ LWHI t5, 20(a1)
+ LWLO t5, 23(a1)
+ LWHI t6, 24(a1)
+ LWLO t6, 27(a1)
+ LWHI t7, 28(a1)
+ LWLO t7, 31(a1)
+ pref 0, 128(a1) /* bring the next lines of src, addr 128 */
+
+ sw t0, 0(a0)
+ sw t1, 4(a0)
+ sw t2, 8(a0)
+ sw t3, 12(a0)
+ sw t4, 16(a0)
+ sw t5, 20(a0)
+ sw t6, 24(a0)
+ sw t7, 28(a0)
+
+ LWHI t0, 32(a1)
+ LWLO t0, 35(a1)
+ LWHI t1, 36(a1)
+ bgtz v1, $ua_skip_pref30_128
+ LWLO t1, 39(a1)
+ pref 30, 128(a0) /* continue setting up the dest, addr 128 */
+$ua_skip_pref30_128:
+ LWHI t2, 40(a1)
+ LWLO t2, 43(a1)
+ LWHI t3, 44(a1)
+ LWLO t3, 47(a1)
+ LWHI t4, 48(a1)
+ LWLO t4, 51(a1)
+ LWHI t5, 52(a1)
+ LWLO t5, 55(a1)
+ LWHI t6, 56(a1)
+ LWLO t6, 59(a1)
+ LWHI t7, 60(a1)
+ LWLO t7, 63(a1)
+ pref 0, 160(a1) /* bring the next lines of src, addr 160 */
+
+ sw t0, 32(a0)
+ sw t1, 36(a0)
+ sw t2, 40(a0)
+ sw t3, 44(a0)
+ sw t4, 48(a0)
+ sw t5, 52(a0)
+ sw t6, 56(a0)
+ sw t7, 60(a0)
+
+ addiu a0, a0, 64 /* adding 64 to dest */
+ sgtu v1, a0, t9
+ bne a0, a3, $ua_loop16w
+ addiu a1, a1, 64 /* adding 64 to src */
+ move a2, t8
+
+/* Here we have src and dest word-aligned but less than 64-bytes to go */
+
+$ua_chk8w:
+ pref 0, 0x0(a1)
+ andi t8, a2, 0x1f /* is there a 32-byte chunk? */
+ /* the t8 is the reminder count */
+ beq a2, t8, $ua_chk1w /* when a2=t8, no 32-byte chunk */
+
+ LWHI t0, 0(a1)
+ LWLO t0, 3(a1)
+ LWHI t1, 4(a1)
+ LWLO t1, 7(a1)
+ LWHI t2, 8(a1)
+ LWLO t2, 11(a1)
+ LWHI t3, 12(a1)
+ LWLO t3, 15(a1)
+ LWHI t4, 16(a1)
+ LWLO t4, 19(a1)
+ LWHI t5, 20(a1)
+ LWLO t5, 23(a1)
+ LWHI t6, 24(a1)
+ LWLO t6, 27(a1)
+ LWHI t7, 28(a1)
+ LWLO t7, 31(a1)
+ addiu a1, a1, 32
+
+ sw t0, 0(a0)
+ sw t1, 4(a0)
+ sw t2, 8(a0)
+ sw t3, 12(a0)
+ sw t4, 16(a0)
+ sw t5, 20(a0)
+ sw t6, 24(a0)
+ sw t7, 28(a0)
+ addiu a0, a0, 32
+
+$ua_chk1w:
+ andi a2, t8, 0x3 /* now a2 is the reminder past 1w chunks */
+ beq a2, t8, $ua_smallCopy
+ subu a3, t8, a2 /* a3 is count of bytes in 1w chunks */
+ addu a3, a0, a3 /* now a3 is the dst address past the 1w chunks */
+
+/* copying in words (4-byte chunks) */
+$ua_wordCopy_loop:
+ LWHI v1, 0(a1)
+ LWLO v1, 3(a1)
+ addiu a1, a1, 4
+ addiu a0, a0, 4 /* note: dst=a0 is word aligned here, see NOTE1 */
+ bne a0, a3, $ua_wordCopy_loop
+ sw v1, -4(a0)
+
+/* Now less than 4 bytes (value in a2) left to copy */
+$ua_smallCopy:
+ beqz a2, leave
+ addu a3, a0, a2 /* a3 is the last dst address */
+$ua_smallCopy_loop:
+ lb v1, 0(a1)
+ addiu a1, a1, 1
+ addiu a0, a0, 1
+ bne a0, a3, $ua_smallCopy_loop
+ sb v1, -1(a0)
+
+ j ra
+ nop
+
+ .set at
+ .set reorder
+
+END(pixman_mips_fast_memcpy)
--
1.7.3
More information about the Pixman
mailing list