<div dir="ltr">A kind contributor gave me an SSSE3 version of videoconvert_convert_matrix8 (4-7 times faster). Profile with the new version:<br><br><div><div>Samples: 198K of event 'cycles', Event count (approx.): 112390215455</div>
<div> 31.71%  libx264.so.142               [.] x264_add8x8_idct_avx2.skip_prologue</div><div> 19.26%  libx264.so.142               [.] 0x00000000000951bc</div><div> 10.58%  libx264.so.142               [.] x264_add8x8_idct_avx.skip_prologue</div>
<div>  3.71%  libgstvideoconvert.so        [.] videoconvert_convert_matrix8         (3.71% vs <span style="font-family:arial,sans-serif;font-size:13px"> 23.51%</span>)</div><div>  2.83%  libx264.so.142               [.] x264_me_search_ref</div>
<div>  2.82%  orcexec.eWcXD2               [.] 0x0000000000000284</div><div>  2.64%  <a href="http://libc-2.18.so">libc-2.18.so</a>                 [.] __memcpy_sse2_unaligned</div><div>  2.24%  libgstvideo-1.0.so.0.204.0   [.] video_chroma_down_v2_guint8</div>
<div>  2.22%  libgstvideo-1.0.so.0.204.0   [.] video_chroma_down_h2_guint8</div><div>  2.20%  libx264.so.142               [.] x264_macroblock_cache_load_progressive</div><div>  1.95%  libx264.so.142               [.] x264_sub8x8_dct_avx.skip_prologue</div>
<div>  1.55%  libx264.so.142               [.] x264_macroblock_analyse</div><div>  0.97%  libx264.so.142               [.] x264_macroblock_encode</div><div>  0.94%  libx264.so.142               [.] x264_macroblock_cache_save</div>
<div>  0.90%  libx264.so.142               [.] x264_mb_predict_mv_direct16x16</div><div>  0.83%  libx264.so.142               [.] x264_mb_predict_mv_ref16x16</div><div>  0.62%  perf                         [.] 0x0000000000067844</div>
<div>  0.59%  libx264.so.142               [.] x264_mb_encode_chroma</div><div>  0.39%  libx264.so.142               [.] x264_macroblock_probe_skip</div></div><div><br></div><div><br></div><div>Is it possible to add this as videoconvert_convert_matrix8_sse upstream?<br>
<br>static void<br>videoconvert_convert_matrix8 (VideoConvert * convert, gpointer pixels)<br>{<br>        int i,j;<br>        guint8 *p = pixels;<br> <br>        __m128i v_byte1 = _mm_set1_epi32(0x000000ff);<br>        __m128i v_byte3 = _mm_set1_epi32(0x00ff0000);<br>
        __m128i v_mat_00 = _mm_set1_epi16((short int)convert->cmatrix[0][0]);<br>        __m128i v_mat_01 = _mm_set1_epi16((short int)convert->cmatrix[0][1]);<br>        __m128i v_mat_02 = _mm_set1_epi16((short int)convert->cmatrix[0][2]);<br>
        __m128i v_mat_03 = _mm_set1_epi16((short int)convert->cmatrix[0][3]);<br>        __m128i v_mat_04 = _mm_set1_epi16((short int)convert->cmatrix[1][0]);<br>        __m128i v_mat_05 = _mm_set1_epi16((short int)convert->cmatrix[1][1]);<br>
        __m128i v_mat_06 = _mm_set1_epi16((short int)convert->cmatrix[1][2]);<br>        __m128i v_mat_07 = _mm_set1_epi16((short int)convert->cmatrix[1][3]);<br>        __m128i v_mat_08 = _mm_set1_epi16((short int)convert->cmatrix[2][0]);<br>
        __m128i v_mat_09 = _mm_set1_epi16((short int)convert->cmatrix[2][1]);<br>        __m128i v_mat_10 = _mm_set1_epi16((short int)convert->cmatrix[2][2]);<br>        __m128i v_mat_11 = _mm_set1_epi16((short int)convert->cmatrix[2][3]);<br>
        <br>        __m128i mask2   = _mm_set1_epi32(0x00ff00ff);<br> <br>        __m128i mask_y1 = _mm_set_epi8((char)128, (char)128, 12, (char)128,   (char)128, (char)128, 8, (char)128,<br>                                        (char)128, (char)128, 4, (char)128,   (char)128, (char)128, 0, (char)128);<br>
 <br>        __m128i mask_y2 = _mm_set_epi8((char)128, (char)128, 14,  (char)128,  (char)128, (char)128, 10, (char)128,<br>                                        (char)128, (char)128, 6, (char)128,   (char)128, (char)128, 2, (char)128);<br>
 <br>        __m128i mask_u1 = _mm_set_epi8((char)128, 12, (char)128, (char)128,   (char)128, 8, (char)128, (char)128,<br>                                        (char)128, 4, (char)128, (char)128,   (char)128, 0, (char)128, (char)128);<br>
 <br>        __m128i mask_u2 = _mm_set_epi8((char)128, 14, (char)128, (char)128,   (char)128, 10, (char)128, (char)128,<br>                                        (char)128, 6, (char)128, (char)128,   (char)128, 2, (char)128, (char)128);<br>
 <br>        __m128i mask_v1 = _mm_set_epi8(12, (char)128, (char)128, (char)128,   8, (char)128, (char)128, (char)128,<br>                                        4, (char)128, (char)128, (char)128,   0, (char)128, (char)128, (char)128);<br>
 <br>        __m128i mask_v2 = _mm_set_epi8(14, (char)128, (char)128, (char)128,   10, (char)128, (char)128, (char)128,<br>                                        6, (char)128, (char)128, (char)128,   2, (char)128, (char)128, (char)128);<br>
 <br>        <br>        for (i=0; i<convert->width / 8; i++) {<br>                __m128i a1, a2, r, g, b, y, u, v, res;<br> <br>                a1 = _mm_loadu_si128((__m128i *)&p[i*32]);<br>                a2 = _mm_loadu_si128((__m128i *)&p[i*32 + 16]);<br>
 <br>                r = _mm_or_si128(_mm_and_si128(_mm_srli_si128(a1, 1), v_byte1), _mm_and_si128(_mm_slli_si128(a2, 1), v_byte3));<br>                g = _mm_or_si128(_mm_and_si128(_mm_srli_si128(a1, 2), v_byte1), _mm_and_si128(a2, v_byte3));<br>
                b = _mm_or_si128(_mm_and_si128(_mm_srli_si128(a1, 3), v_byte1), _mm_and_si128(_mm_srli_si128(a2, 1), v_byte3));<br> <br> <br>                y = _mm_add_epi16(<br>                        _mm_add_epi16(<br>
                                _mm_mullo_epi16(r, v_mat_00),<br>                                _mm_mullo_epi16(g, v_mat_01)),<br>                        _mm_add_epi16(<br>                                _mm_mullo_epi16(b, v_mat_02),<br>
                                v_mat_03));<br> <br>                y = _mm_and_si128(_mm_srai_epi16(y, 8), mask2);<br> <br>                u = _mm_add_epi16(<br>                        _mm_add_epi16(<br>                                _mm_mullo_epi16(r, v_mat_04),<br>
                                _mm_mullo_epi16(g, v_mat_05)),<br>                        _mm_add_epi16(<br>                                _mm_mullo_epi16(b, v_mat_06),<br>                                v_mat_07));<br> <br>
                u  = _mm_and_si128(_mm_srai_epi16(u, 8), mask2);<br> <br>                v = _mm_add_epi16(<br>                        _mm_add_epi16(<br>                                _mm_mullo_epi16(r, v_mat_08),<br>                                _mm_mullo_epi16(g, v_mat_09)),<br>
                        _mm_add_epi16(<br>                                _mm_mullo_epi16(b, v_mat_10),<br>                                v_mat_11));<br> <br>                v = _mm_and_si128(_mm_srai_epi16(v, 8), mask2);<br>
 <br> <br>                res = _mm_or_si128(_mm_shuffle_epi8(y, mask_y1), _mm_shuffle_epi8(u, mask_u1));<br>                res = _mm_or_si128(res, _mm_shuffle_epi8(v, mask_v1));<br> <br>                _mm_storeu_si128((__m128i *)&p[i*32], res);<br>
 <br>                res = _mm_or_si128(_mm_shuffle_epi8(y, mask_y2), _mm_shuffle_epi8(u, mask_u2));<br>               res = _mm_or_si128(res, _mm_shuffle_epi8(v, mask_v2));<br> <br>                _mm_storeu_si128((__m128i *)&p[i*32 + 16], res);<br>
        }<br>}<br></div></div>