Videoconvert needs to be optimized

Yaroslav Andrusyak pontostroy at gmail.com
Sat Jun 21 13:30:52 PDT 2014


Someone kindly gave me an SSSE3 version of videoconvert_convert_matrix8
(4-7 times faster).

Samples: 198K of event 'cycles', Event count (approx.): 112390215455


 31.71%  libx264.so.142               [.] x264_add8x8_idct_avx2.skip_prologue
 19.26%  libx264.so.142               [.] 0x00000000000951bc
 10.58%  libx264.so.142               [.] x264_add8x8_idct_avx.skip_prologue
  3.71%  libgstvideoconvert.so        [.] videoconvert_convert_matrix8
    (3.71% with the SSE version vs. 23.51% before)
  2.83%  libx264.so.142               [.] x264_me_search_ref
  2.82%  orcexec.eWcXD2               [.] 0x0000000000000284
  2.64%  libc-2.18.so                 [.] __memcpy_sse2_unaligned
  2.24%  libgstvideo-1.0.so.0.204.0   [.] video_chroma_down_v2_guint8
  2.22%  libgstvideo-1.0.so.0.204.0   [.] video_chroma_down_h2_guint8
  2.20%  libx264.so.142               [.] x264_macroblock_cache_load_progressive
  1.95%  libx264.so.142               [.] x264_sub8x8_dct_avx.skip_prologue
  1.55%  libx264.so.142               [.] x264_macroblock_analyse
  0.97%  libx264.so.142               [.] x264_macroblock_encode
  0.94%  libx264.so.142               [.] x264_macroblock_cache_save
  0.90%  libx264.so.142               [.] x264_mb_predict_mv_direct16x16
  0.83%  libx264.so.142               [.] x264_mb_predict_mv_ref16x16
  0.62%  perf                         [.] 0x0000000000067844
  0.59%  libx264.so.142               [.] x264_mb_encode_chroma
  0.39%  libx264.so.142               [.] x264_macroblock_probe_skip


Would it be possible to add videoconvert_convert_matrix8_sse upstream?

#include <tmmintrin.h>          /* SSSE3 intrinsics (_mm_shuffle_epi8) */

static void
videoconvert_convert_matrix8 (VideoConvert * convert, gpointer pixels)
{
        int i;
        guint8 *p = pixels;

        /* byte of each 32-bit lane that holds the component of the first
         * (bits 0-7) and second (bits 16-23) group of four pixels */
        __m128i v_byte1 = _mm_set1_epi32 (0x000000ff);
        __m128i v_byte3 = _mm_set1_epi32 (0x00ff0000);

        /* broadcast every color-matrix coefficient to all eight 16-bit lanes */
        __m128i v_mat_00 = _mm_set1_epi16 ((short int) convert->cmatrix[0][0]);
        __m128i v_mat_01 = _mm_set1_epi16 ((short int) convert->cmatrix[0][1]);
        __m128i v_mat_02 = _mm_set1_epi16 ((short int) convert->cmatrix[0][2]);
        __m128i v_mat_03 = _mm_set1_epi16 ((short int) convert->cmatrix[0][3]);
        __m128i v_mat_04 = _mm_set1_epi16 ((short int) convert->cmatrix[1][0]);
        __m128i v_mat_05 = _mm_set1_epi16 ((short int) convert->cmatrix[1][1]);
        __m128i v_mat_06 = _mm_set1_epi16 ((short int) convert->cmatrix[1][2]);
        __m128i v_mat_07 = _mm_set1_epi16 ((short int) convert->cmatrix[1][3]);
        __m128i v_mat_08 = _mm_set1_epi16 ((short int) convert->cmatrix[2][0]);
        __m128i v_mat_09 = _mm_set1_epi16 ((short int) convert->cmatrix[2][1]);
        __m128i v_mat_10 = _mm_set1_epi16 ((short int) convert->cmatrix[2][2]);
        __m128i v_mat_11 = _mm_set1_epi16 ((short int) convert->cmatrix[2][3]);

        __m128i mask2   = _mm_set1_epi32 (0x00ff00ff);

        /* shuffle masks that scatter the converted components back to their
         * byte positions; 128 (high bit set) produces a zero byte */
        __m128i mask_y1 = _mm_set_epi8 ((char) 128, (char) 128, 12, (char) 128,
            (char) 128, (char) 128, 8, (char) 128,
            (char) 128, (char) 128, 4, (char) 128,
            (char) 128, (char) 128, 0, (char) 128);
        __m128i mask_y2 = _mm_set_epi8 ((char) 128, (char) 128, 14, (char) 128,
            (char) 128, (char) 128, 10, (char) 128,
            (char) 128, (char) 128, 6, (char) 128,
            (char) 128, (char) 128, 2, (char) 128);
        __m128i mask_u1 = _mm_set_epi8 ((char) 128, 12, (char) 128, (char) 128,
            (char) 128, 8, (char) 128, (char) 128,
            (char) 128, 4, (char) 128, (char) 128,
            (char) 128, 0, (char) 128, (char) 128);
        __m128i mask_u2 = _mm_set_epi8 ((char) 128, 14, (char) 128, (char) 128,
            (char) 128, 10, (char) 128, (char) 128,
            (char) 128, 6, (char) 128, (char) 128,
            (char) 128, 2, (char) 128, (char) 128);
        __m128i mask_v1 = _mm_set_epi8 (12, (char) 128, (char) 128, (char) 128,
            8, (char) 128, (char) 128, (char) 128,
            4, (char) 128, (char) 128, (char) 128,
            0, (char) 128, (char) 128, (char) 128);
        __m128i mask_v2 = _mm_set_epi8 (14, (char) 128, (char) 128, (char) 128,
            10, (char) 128, (char) 128, (char) 128,
            6, (char) 128, (char) 128, (char) 128,
            2, (char) 128, (char) 128, (char) 128);

        /* process 8 pixels (32 bytes) per iteration */
        for (i = 0; i < convert->width / 8; i++) {
                __m128i a1, a2, r, g, b, y, u, v, res;

                a1 = _mm_loadu_si128 ((__m128i *) &p[i * 32]);
                a2 = _mm_loadu_si128 ((__m128i *) &p[i * 32 + 16]);

                /* pack the same component of pixels 0-3 (from a1) and 4-7
                 * (from a2) into the low/high 16-bit half of each lane */
                r = _mm_or_si128 (_mm_and_si128 (_mm_srli_si128 (a1, 1), v_byte1),
                    _mm_and_si128 (_mm_slli_si128 (a2, 1), v_byte3));
                g = _mm_or_si128 (_mm_and_si128 (_mm_srli_si128 (a1, 2), v_byte1),
                    _mm_and_si128 (a2, v_byte3));
                b = _mm_or_si128 (_mm_and_si128 (_mm_srli_si128 (a1, 3), v_byte1),
                    _mm_and_si128 (_mm_srli_si128 (a2, 1), v_byte3));

                /* y = (m00*r + m01*g + m02*b + m03) >> 8 for all 8 pixels */
                y = _mm_add_epi16 (
                        _mm_add_epi16 (
                                _mm_mullo_epi16 (r, v_mat_00),
                                _mm_mullo_epi16 (g, v_mat_01)),
                        _mm_add_epi16 (
                                _mm_mullo_epi16 (b, v_mat_02),
                                v_mat_03));
                y = _mm_and_si128 (_mm_srai_epi16 (y, 8), mask2);

                u = _mm_add_epi16 (
                        _mm_add_epi16 (
                                _mm_mullo_epi16 (r, v_mat_04),
                                _mm_mullo_epi16 (g, v_mat_05)),
                        _mm_add_epi16 (
                                _mm_mullo_epi16 (b, v_mat_06),
                                v_mat_07));
                u = _mm_and_si128 (_mm_srai_epi16 (u, 8), mask2);

                v = _mm_add_epi16 (
                        _mm_add_epi16 (
                                _mm_mullo_epi16 (r, v_mat_08),
                                _mm_mullo_epi16 (g, v_mat_09)),
                        _mm_add_epi16 (
                                _mm_mullo_epi16 (b, v_mat_10),
                                v_mat_11));
                v = _mm_and_si128 (_mm_srai_epi16 (v, 8), mask2);

                /* scatter the results for pixels 0-3 back into place and store
                 * (note: the alpha byte of each pixel is written as 0) */
                res = _mm_or_si128 (_mm_shuffle_epi8 (y, mask_y1),
                    _mm_shuffle_epi8 (u, mask_u1));
                res = _mm_or_si128 (res, _mm_shuffle_epi8 (v, mask_v1));
                _mm_storeu_si128 ((__m128i *) &p[i * 32], res);

                /* same for pixels 4-7 */
                res = _mm_or_si128 (_mm_shuffle_epi8 (y, mask_y2),
                    _mm_shuffle_epi8 (u, mask_u2));
                res = _mm_or_si128 (res, _mm_shuffle_epi8 (v, mask_v2));
                _mm_storeu_si128 ((__m128i *) &p[i * 32 + 16], res);
        }
}
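
For upstreaming, two more things would probably be needed: the loop above only
covers width / 8 * 8 pixels, and the SSSE3 path should be chosen at runtime so
the element still works on CPUs without SSSE3. A rough sketch of both follows;
the _tail, _dispatch and _c names are only placeholders, and the tail loop just
repeats the (coefficient * component + offset) >> 8 arithmetic of the vector
loop, without clamping:

/* Sketch: convert the pixels left over when width is not a multiple of 8,
 * using the same arithmetic as the vector loop above. */
static void
videoconvert_convert_matrix8_tail (VideoConvert * convert, guint8 * p, int start)
{
        int i;

        for (i = start; i < convert->width; i++) {
                int r = p[i * 4 + 1];
                int g = p[i * 4 + 2];
                int b = p[i * 4 + 3];

                p[i * 4 + 1] = (convert->cmatrix[0][0] * r + convert->cmatrix[0][1] * g +
                    convert->cmatrix[0][2] * b + convert->cmatrix[0][3]) >> 8;
                p[i * 4 + 2] = (convert->cmatrix[1][0] * r + convert->cmatrix[1][1] * g +
                    convert->cmatrix[1][2] * b + convert->cmatrix[1][3]) >> 8;
                p[i * 4 + 3] = (convert->cmatrix[2][0] * r + convert->cmatrix[2][1] * g +
                    convert->cmatrix[2][2] * b + convert->cmatrix[2][3]) >> 8;
        }
}

/* Sketch: select the SSSE3 path at runtime (GCC/Clang builtin) and fall
 * back to the plain C implementation on CPUs without SSSE3. */
static void
videoconvert_convert_matrix8_dispatch (VideoConvert * convert, gpointer pixels)
{
        if (__builtin_cpu_supports ("ssse3")) {
                videoconvert_convert_matrix8 (convert, pixels);   /* SSSE3 loop above */
                videoconvert_convert_matrix8_tail (convert, pixels,
                    (convert->width / 8) * 8);
        } else {
                videoconvert_convert_matrix8_c (convert, pixels); /* existing scalar code */
        }
}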