Hi everyone,<div><br></div><div>I wrote some NEON code for bilinear-scaled 'over 8888 8888', extending the previous patches from Siarhei Siamashka.</div><div>I put some combining operations just before storing the interpolated pixels into the destination buffer.</div>
<div>It has passed several of my naive test cases, but I would very much appreciate it if anyone could review my implementation.</div><div>Also, please let me know where I can find the microbenchmark for performance measurement.</div>
<div><br></div><div>Thanks in advance.</div><div><br></div><div><div><span style="font-family:monospace;white-space:pre-wrap;font-size:medium">---</span></div></div><div><div> pixman/pixman-arm-neon-asm.S | 96 +++++++++++++++++++++++++++++++++++++-----</div>
<div> pixman/pixman-arm-neon.c | 4 ++</div><div> 2 files changed, 89 insertions(+), 11 deletions(-)</div></div><div><br></div><div><div>diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S</div>
<div>index 71b30ac..e178a75 100644</div><div>--- a/pixman/pixman-arm-neon-asm.S</div><div>+++ b/pixman/pixman-arm-neon-asm.S</div><div>@@ -2554,7 +2554,74 @@ fname:</div><div> .endif</div><div> .endm</div><div> </div><div>
-.macro bilinear_interpolate_last_pixel src_fmt, dst_fmt</div><div>+/*</div><div>+ * Combine functions are called just before writing results to memory.</div><div>+ * Assume that source pixels are located in d0, d1 registers</div>
<div>+ * in a8r8g8b8 format.</div><div>+ * Combine functions may use registers d2~d31 and overwrite result</div><div>+ * on d0, d1 registers.</div><div>+ * TODO: 0565 format, optimization for 2 and 1 pixel case</div><div>
+ * TODO: Fix hard-coded prefetch distance</div><div>+ */</div><div>+</div><div>+/* Dummy combine function for operator SRC */</div><div>+.macro bilinear_combine_src dst_fmt, numpix</div><div>+.endm</div><div>+</div><div>
+/* Destination pixel load functions for bilinear_combine_XXXX */</div><div>+.macro bilinear_load_dst_8888 numpix</div><div>+.if numpix == 4</div><div>+<span style="white-space:pre-wrap">        </span>vld1.32<span style="white-space:pre-wrap">                </span>{d2, d3}, [OUT]</div>
<div>+<span style="white-space:pre-wrap">        </span>pld<span style="white-space:pre-wrap">                        </span>[OUT, #16]</div><div>+.elseif numpix == 2</div><div>+<span style="white-space:pre-wrap">        </span>vld1.32<span style="white-space:pre-wrap">                </span>{d2}, [OUT]</div>
<div>+.elseif numpix == 1</div><div>+<span style="white-space:pre-wrap">        </span>vld1.32<span style="white-space:pre-wrap">                </span>{d2[0]}, [OUT]</div><div>+.else</div><div>+<span style="white-space:pre-wrap">        </span>.error bilinear_load_dst_8888 numpix is unsupported</div>
<div>+.endif</div><div>+.endm</div><div>+</div><div>+.macro bilinear_load_dst_0565 numpix</div><div>+.if numpix == 4</div><div>+.elseif numpix == 2</div><div>+.elseif numpix == 1</div><div>+.else</div><div>+<span style="white-space:pre-wrap">        </span>.error bilinear_load_dst_0565 numpix is unsupported</div>
<div>+.endif</div><div>+.endm</div><div>+</div><div>+/* Combine function for operator OVER */</div><div>+.macro bilinear_combine_over dst_fmt, numpix</div><div>+<span style="white-space:pre-wrap">        </span>bilinear_load_dst_&dst_fmt numpix</div>
<div>+<span style="white-space:pre-wrap">        </span>/* Deinterleave source & destination */</div><div>+<span style="white-space:pre-wrap">        </span>vuzp.8<span style="white-space:pre-wrap">                </span>d0, d1</div>
<div>+<span style="white-space:pre-wrap">        </span>vuzp.8<span style="white-space:pre-wrap">                </span>d0, d1</div><div>+<span style="white-space:pre-wrap">        </span>vuzp.8<span style="white-space:pre-wrap">                </span>d2, d3</div>
<div>+<span style="white-space:pre-wrap">        </span>vuzp.8<span style="white-space:pre-wrap">                </span>d2, d3</div><div>+<span style="white-space:pre-wrap">        </span></div><div>
+<span style="white-space:pre-wrap">        </span>/* invert source alpha */</div><div>+<span style="white-space:pre-wrap">        </span>vdup.32<span style="white-space:pre-wrap">                </span>d4, d1[1]</div>
<div>+<span style="white-space:pre-wrap">        </span>vmvn.8<span style="white-space:pre-wrap">                </span>d4, d4</div><div>+</div><div>+<span style="white-space:pre-wrap">        </span>/* result = dst*(256 - srcA) */</div>
<div>+<span style="white-space:pre-wrap">        </span>vmull.u8<span style="white-space:pre-wrap">        </span>q3, d2, d4</div><div>+<span style="white-space:pre-wrap">        </span>vmull.u8<span style="white-space:pre-wrap">        </span>q4, d3, d4</div>
<div>+</div><div>+<span style="white-space:pre-wrap">        </span>vrshr.u16<span style="white-space:pre-wrap">        </span>q5, q3, #8</div><div>+<span style="white-space:pre-wrap">        </span>vrshr.u16<span style="white-space:pre-wrap">        </span>q6, q4, #8</div>
<div>+<span style="white-space:pre-wrap">        </span>vraddhn.u16<span style="white-space:pre-wrap">        </span>d14, q5, q3</div><div>+<span style="white-space:pre-wrap">        </span>vraddhn.u16<span style="white-space:pre-wrap">        </span>d15, q6, q4</div>
<div>+</div><div>+<span style="white-space:pre-wrap">        </span>/* result += src (premultiplied) */</div><div>+<span style="white-space:pre-wrap">        </span>vqadd.u8<span style="white-space:pre-wrap">        </span>q0, q7, q0</div>
<div>+</div><div>+<span style="white-space:pre-wrap">        </span>/* Interleave (rrrr, gggg, bbbb, aaaa) into (rgba, rgba, rgba, rgba) */</div><div>+<span style="white-space:pre-wrap">        </span>vuzp.8<span style="white-space:pre-wrap">                </span>d0, d1</div>
<div>+<span style="white-space:pre-wrap">        </span>vuzp.8<span style="white-space:pre-wrap">                </span>d0, d1</div><div>+.endm</div><div>+</div><div>+.macro bilinear_interpolate_last_pixel src_fmt, dst_fmt, op</div>
<div> bilinear_load_&src_fmt d0, d1, d2</div><div> vmull.u8 q1, d0, d28</div><div> vmlal.u8 q1, d1, d29</div><div>@@ -2568,10 +2635,11 @@ fname:</div><div> /* 3 cycles bubble */</div><div> vmovn.u16 d0, q0</div>
<div> /* 1 cycle bubble */</div><div>+<span style="white-space:pre-wrap">        </span>bilinear_combine_&op dst_fmt, 1</div><div> bilinear_store_&dst_fmt 1, q2, q3</div><div> .endm</div><div> </div>
<div>-.macro bilinear_interpolate_two_pixels src_fmt, dst_fmt</div><div>+.macro bilinear_interpolate_two_pixels src_fmt, dst_fmt, op</div><div> bilinear_load_and_vertical_interpolate_two_&src_fmt \</div><div> q1, q11, d0, d1, d20, d21, d22, d23</div>
<div> vshr.u16 q15, q12, #8</div><div>@@ -2585,10 +2653,11 @@ fname:</div><div> vshrn.u32 d30, q0, #16</div><div> vshrn.u32 d31, q10, #16</div><div> vmovn.u16 d0, q15</div><div>+<span style="white-space:pre-wrap">        </span>bilinear_combine_&op dst_fmt, 2</div>
<div> bilinear_store_&dst_fmt 2, q2, q3</div><div> .endm</div><div> </div><div>-.macro bilinear_interpolate_four_pixels src_fmt, dst_fmt</div><div>+.macro bilinear_interpolate_four_pixels src_fmt, dst_fmt, op</div>
<div> bilinear_load_and_vertical_interpolate_four_&src_fmt \</div><div> q1, q11, d0, d1, d20, d21, d22, d23 \</div><div> q3, q9, d4, d5, d16, d17, d18, d19</div><div>@@ -2616,6 +2685,7 @@ fname:</div>
<div> vshrn.u32 d5, q8, #16</div><div> vmovn.u16 d0, q0</div><div> vmovn.u16 d1, q2</div><div>+<span style="white-space:pre-wrap">        </span>bilinear_combine_&op dst_fmt, 4</div><div> bilinear_store_&dst_fmt 4, q2, q3</div>
<div> .endm</div><div> </div><div>@@ -2635,7 +2705,7 @@ fname:</div><div> * pixels ahead</div><div> */</div><div> </div><div>-.macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \</div>
<div>
+.macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, op, \</div><div> bpp_shift, prefetch_distance</div><div> </div><div> pixman_asm_function fname</div><div>@@ -2673,17 +2743,17 @@ pixman_asm_function fname</div>
<div> blt 1f</div><div> mov PF_OFFS, PF_OFFS, asr #(16 - bpp_shift)</div><div> 0:</div><div>- bilinear_interpolate_four_pixels src_fmt, dst_fmt</div><div>+ bilinear_interpolate_four_pixels src_fmt, dst_fmt, op</div>
<div> subs WIDTH, WIDTH, #4</div><div> bge 0b</div><div> 1:</div><div> tst WIDTH, #2</div><div> beq 2f</div><div>- bilinear_interpolate_two_pixels src_fmt, dst_fmt</div><div>+ bilinear_interpolate_two_pixels src_fmt, dst_fmt, op</div>
<div> 2:</div><div> tst WIDTH, #1</div><div> beq 3f</div><div>- bilinear_interpolate_last_pixel src_fmt, dst_fmt</div><div>+ bilinear_interpolate_last_pixel src_fmt, dst_fmt, op</div><div> 3:</div>
<div> pop {r4, r5, r6, r7, r8, r9}</div><div> bx lr</div><div>@@ -2706,13 +2776,17 @@ pixman_asm_function fname</div><div> .endm</div><div> </div><div> generate_bilinear_scanline_func \</div><div>- pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, 2, 28</div>
<div>+ pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, src, 2, 28</div><div> </div><div> generate_bilinear_scanline_func \</div><div>- pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, 2, 28</div>
<div>+ pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, src, 2, 28</div><div> </div><div> generate_bilinear_scanline_func \</div><div>- pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, 1, 28</div>
<div>+ pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, src, 1, 28</div><div> </div><div> generate_bilinear_scanline_func \</div><div>- pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, 1, 28</div>
<div>+ pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, src, 1, 28</div><div>+</div><div>+generate_bilinear_scanline_func \</div><div>+<span style="white-space:pre-wrap">        </span>pixman_scaled_bilinear_scanline_8888_8888_OVER_asm_neon, 8888, 8888, over, 2, 28</div>
<div>+</div><div>diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c</div><div>index 0a10ca1..7042ce1 100644</div><div>--- a/pixman/pixman-arm-neon.c</div><div>+++ b/pixman/pixman-arm-neon.c</div><div>@@ -136,6 +136,8 @@ PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 0565_x888, SRC,</div>
<div> PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 0565_0565, SRC,</div><div> uint16_t, uint16_t)</div><div> </div><div>+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST(0, neon, 8888_8888, OVER,</div>
<div>+<span style="white-space:pre-wrap">                                                                                </span>uint32_t, uint32_t)</div><div> void</div><div> pixman_composite_src_n_8_asm_neon (int32_t w,</div><div> int32_t h,</div>
<div>@@ -362,6 +364,8 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =</div><div> SIMPLE_BILINEAR_FAST_PATH (SRC, r5g6b5, x8r8g8b8, neon_0565_x888),</div><div> SIMPLE_BILINEAR_FAST_PATH (SRC, r5g6b5, r5g6b5, neon_0565_0565),</div>
<div> </div><div>+<span style="white-space:pre-wrap">        </span>SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, neon_8888_8888),</div><div>+</div><div> { PIXMAN_OP_NONE },</div><div> };</div><div> </div>
<div><br></div>-- <br>Best Regards,<div>Taekyun Kim</div><br>
</div>