Hi everyone,<div><br></div><div>I wrote some NEON code for bilinear scaled &#39;over 8888 8888&#39;, extending the previous patches from Siarhei Siamashka.</div><div>I added the combining operations just before the interpolated pixels are stored into the destination buffer.</div>

<div>It has passed several of my naive test cases, but it would be much appreciated if anyone could review my implementation.</div><div>Also, please let me know where I can find the microbenchmark for performance measurement.</div>

<div><br></div><div>Thanks in advance.</div><div><br></div><div><div><span style="font-family:monospace;white-space:pre-wrap;font-size:medium">---</span></div></div><div><div> pixman/pixman-arm-neon-asm.S |   96 +++++++++++++++++++++++++++++++++++++-----</div>

<div> pixman/pixman-arm-neon.c     |    4 ++</div><div> 2 files changed, 89 insertions(+), 11 deletions(-)</div></div><div><br></div><div><div>diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S</div>

<div>index 71b30ac..e178a75 100644</div><div>--- a/pixman/pixman-arm-neon-asm.S</div><div>+++ b/pixman/pixman-arm-neon-asm.S</div><div>@@ -2554,7 +2554,74 @@ fname:</div><div> .endif</div><div> .endm</div><div> </div><div>

-.macro bilinear_interpolate_last_pixel src_fmt, dst_fmt</div><div>+/*</div><div>+ * Combine functions are called just before writing results to memory.</div><div>+ * Assume that source pixels are located in d0, d1 registers</div>

<div>+ * in a8r8g8b8 format.</div><div>+ * Combine functions may use registers d2~d31 and overwrite result</div><div>+ * on d0, d1 registers.</div><div>+ * TODO: 0565 format, optimization for 2 and 1 pixel case</div><div>

+ * TODO: Fix hard-coded prefetch distance</div><div>+ */</div><div>+</div><div>+/* Dummy combine function for operator SRC */</div><div>+.macro bilinear_combine_src dst_fmt, numpix</div><div>+.endm</div><div>+</div><div>

+/* Destination pixel load functions for bilinear_combine_XXXX */</div><div>+.macro bilinear_load_dst_8888 numpix</div><div>+.if numpix == 4</div><div>+<span style="white-space:pre-wrap">        </span>vld1.32<span style="white-space:pre-wrap">                </span>{d2, d3}, [OUT]</div>

<div>+<span style="white-space:pre-wrap">        </span>pld<span style="white-space:pre-wrap">                        </span>[OUT, #16]</div><div>+.elseif numpix == 2</div><div>+<span style="white-space:pre-wrap">        </span>vld1.32<span style="white-space:pre-wrap">                </span>{d2}, [OUT]</div>

<div>+.elseif numpix == 1</div><div>+<span style="white-space:pre-wrap">        </span>vld1.32<span style="white-space:pre-wrap">                </span>{d2[0]}, [OUT]</div><div>+.else</div><div>+<span style="white-space:pre-wrap">        </span>.error bilinear_load_dst_8888 numpix is unsupported</div>

<div>+.endif</div><div>+.endm</div><div>+</div><div>+.macro bilinear_load_dst_0565 numpix</div><div>+.if numpix == 4</div><div>+.elseif numpix == 2</div><div>+.elseif numpix == 1</div><div>+.else</div><div>+<span style="white-space:pre-wrap">        </span>.error bilinear_load_dst_0565 numpix is unsupported</div>

<div>+.endif</div><div>+.endm</div><div>+</div><div>+/* Combine function for operator OVER */</div><div>+.macro bilinear_combine_over dst_fmt, numpix</div><div>+<span style="white-space:pre-wrap">        </span>bilinear_load_dst_&amp;dst_fmt numpix</div>

<div>+<span style="white-space:pre-wrap">        </span>/* Deinterleave source &amp; destination */</div><div>+<span style="white-space:pre-wrap">        </span>vuzp.8<span style="white-space:pre-wrap">                </span>d0, d1</div>
<div>+<span style="white-space:pre-wrap">        </span>vuzp.8<span style="white-space:pre-wrap">                </span>d0, d1</div><div>+<span style="white-space:pre-wrap">        </span>vuzp.8<span style="white-space:pre-wrap">                </span>d2, d3</div>

<div>+<span style="white-space:pre-wrap">        </span>vuzp.8<span style="white-space:pre-wrap">                </span>d2, d3</div><div>+<span style="white-space:pre-wrap">        </span></div><div>
+<span style="white-space:pre-wrap">        </span>/* invert source alpha */</div><div>+<span style="white-space:pre-wrap">        </span>vdup.32<span style="white-space:pre-wrap">                </span>d4, d1[1]</div>
<div>+<span style="white-space:pre-wrap">        </span>vmvn.8<span style="white-space:pre-wrap">                </span>d4, d4</div><div>+</div><div>+<span style="white-space:pre-wrap">        </span>/* result = dst*(255 - srcA) */</div>
<div>+<span style="white-space:pre-wrap">        </span>vmull.u8<span style="white-space:pre-wrap">        </span>q3, d2, d4</div><div>+<span style="white-space:pre-wrap">        </span>vmull.u8<span style="white-space:pre-wrap">        </span>q4, d3, d4</div>

<div>+</div><div>+<span style="white-space:pre-wrap">        </span>vrshr.u16<span style="white-space:pre-wrap">        </span>q5, q3, #8</div><div>+<span style="white-space:pre-wrap">        </span>vrshr.u16<span style="white-space:pre-wrap">        </span>q6, q4, #8</div>

<div>+<span style="white-space:pre-wrap">        </span>vraddhn.u16<span style="white-space:pre-wrap">        </span>d14, q5, q3</div><div>+<span style="white-space:pre-wrap">        </span>vraddhn.u16<span style="white-space:pre-wrap">        </span>d15, q6, q4</div>

<div>+</div><div>+<span style="white-space:pre-wrap">        </span>/* result += src (premultiplied) */</div><div>+<span style="white-space:pre-wrap">        </span>vqadd.u8<span style="white-space:pre-wrap">        </span>q0, q7, q0</div>
<div>+</div><div>+<span style="white-space:pre-wrap">        </span>/* Interleave (rrrr, gggg, bbbb, aaaa) into (rgba, rgba, rgba, rgba) */</div><div>+<span style="white-space:pre-wrap">        </span>vuzp.8<span style="white-space:pre-wrap">                </span>d0, d1</div>

<div>+<span style="white-space:pre-wrap">        </span>vuzp.8<span style="white-space:pre-wrap">                </span>d0, d1</div><div>+.endm</div><div>+</div><div>+.macro bilinear_interpolate_last_pixel src_fmt, dst_fmt, op</div>
<div>     bilinear_load_&amp;src_fmt d0, d1, d2</div><div>     vmull.u8  q1, d0, d28</div><div>     vmlal.u8  q1, d1, d29</div><div>@@ -2568,10 +2635,11 @@ fname:</div><div>     /* 3 cycles bubble */</div><div>     vmovn.u16 d0, q0</div>

<div>     /* 1 cycle bubble */</div><div>+<span style="white-space:pre-wrap">        </span>bilinear_combine_&amp;op dst_fmt, 1</div><div>     bilinear_store_&amp;dst_fmt 1, q2, q3</div><div> .endm</div><div> </div>
<div>-.macro bilinear_interpolate_two_pixels src_fmt, dst_fmt</div><div>+.macro bilinear_interpolate_two_pixels src_fmt, dst_fmt, op</div><div>     bilinear_load_and_vertical_interpolate_two_&amp;src_fmt \</div><div>                 q1, q11, d0, d1, d20, d21, d22, d23</div>

<div>     vshr.u16  q15, q12, #8</div><div>@@ -2585,10 +2653,11 @@ fname:</div><div>     vshrn.u32 d30, q0, #16</div><div>     vshrn.u32 d31, q10, #16</div><div>     vmovn.u16 d0, q15</div><div>+<span style="white-space:pre-wrap">        </span>bilinear_combine_&amp;op dst_fmt, 2</div>

<div>     bilinear_store_&amp;dst_fmt 2, q2, q3</div><div> .endm</div><div> </div><div>-.macro bilinear_interpolate_four_pixels src_fmt, dst_fmt</div><div>+.macro bilinear_interpolate_four_pixels src_fmt, dst_fmt, op</div>

<div>     bilinear_load_and_vertical_interpolate_four_&amp;src_fmt \</div><div>                 q1, q11, d0, d1, d20, d21, d22, d23 \</div><div>                 q3, q9,  d4, d5, d16, d17, d18, d19</div><div>@@ -2616,6 +2685,7 @@ fname:</div>

<div>     vshrn.u32 d5, q8, #16</div><div>     vmovn.u16 d0, q0</div><div>     vmovn.u16 d1, q2</div><div>+<span style="white-space:pre-wrap">        </span>bilinear_combine_&amp;op dst_fmt, 4</div><div>     bilinear_store_&amp;dst_fmt 4, q2, q3</div>

<div> .endm</div><div> </div><div>@@ -2635,7 +2705,7 @@ fname:</div><div>  *                      pixels ahead</div><div>  */</div><div> </div><div>-.macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \</div>
<div>
+.macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, op, \</div><div>                                        bpp_shift, prefetch_distance</div><div> </div><div> pixman_asm_function fname</div><div>@@ -2673,17 +2743,17 @@ pixman_asm_function fname</div>

<div>     blt       1f</div><div>     mov       PF_OFFS, PF_OFFS, asr #(16 - bpp_shift)</div><div> 0:</div><div>-    bilinear_interpolate_four_pixels src_fmt, dst_fmt</div><div>+    bilinear_interpolate_four_pixels src_fmt, dst_fmt, op</div>

<div>     subs      WIDTH, WIDTH, #4</div><div>     bge       0b</div><div> 1:</div><div>     tst       WIDTH, #2</div><div>     beq       2f</div><div>-    bilinear_interpolate_two_pixels src_fmt, dst_fmt</div><div>+    bilinear_interpolate_two_pixels src_fmt, dst_fmt, op</div>

<div> 2:</div><div>     tst       WIDTH, #1</div><div>     beq       3f</div><div>-    bilinear_interpolate_last_pixel src_fmt, dst_fmt</div><div>+    bilinear_interpolate_last_pixel src_fmt, dst_fmt, op</div><div> 3:</div>

<div>     pop       {r4, r5, r6, r7, r8, r9}</div><div>     bx        lr</div><div>@@ -2706,13 +2776,17 @@ pixman_asm_function fname</div><div> .endm</div><div> </div><div> generate_bilinear_scanline_func \</div><div>-    pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, 2, 28</div>

<div>+    pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, src, 2, 28</div><div> </div><div> generate_bilinear_scanline_func \</div><div>-    pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, 2, 28</div>

<div>+    pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, src, 2, 28</div><div> </div><div> generate_bilinear_scanline_func \</div><div>-    pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, 1, 28</div>

<div>+    pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, src, 1, 28</div><div> </div><div> generate_bilinear_scanline_func \</div><div>-    pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, 1, 28</div>

<div>+    pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, src, 1, 28</div><div>+</div><div>+generate_bilinear_scanline_func \</div><div>+<span style="white-space:pre-wrap">        </span>pixman_scaled_bilinear_scanline_8888_8888_OVER_asm_neon, 8888, 8888, over, 2, 28</div>

<div>+</div><div>diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c</div><div>index 0a10ca1..7042ce1 100644</div><div>--- a/pixman/pixman-arm-neon.c</div><div>+++ b/pixman/pixman-arm-neon.c</div><div>@@ -136,6 +136,8 @@ PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 0565_x888, SRC,</div>

<div> PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 0565_0565, SRC,</div><div>                                          uint16_t, uint16_t)</div><div> </div><div>+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST(0, neon, 8888_8888, OVER,</div>

<div>+<span style="white-space:pre-wrap">                                                                                </span>uint32_t, uint32_t)</div><div> void</div><div> pixman_composite_src_n_8_asm_neon (int32_t   w,</div><div>                                    int32_t   h,</div>
<div>@@ -362,6 +364,8 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =</div><div>     SIMPLE_BILINEAR_FAST_PATH (SRC, r5g6b5, x8r8g8b8, neon_0565_x888),</div><div>     SIMPLE_BILINEAR_FAST_PATH (SRC, r5g6b5, r5g6b5, neon_0565_0565),</div>

<div> </div><div>+<span style="white-space:pre-wrap">        </span>SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, neon_8888_8888),</div><div>+</div><div>     { PIXMAN_OP_NONE },</div><div> };</div><div> </div>
<div><br></div>-- <br>Best Regards,<div>Taekyun Kim</div><br>
</div>