Videoconvert needs to be optimized
Yaroslav Andrusyak
pontostroy at gmail.com
Sat Jun 21 13:30:52 PDT 2014
A kind person gave me an SSSE3 version of videoconvert_convert_matrix8 (4-7
times faster).
Samples: 198K of event 'cycles', Event count (approx.): 112390215455
31.71% libx264.so.142 [.]
x264_add8x8_idct_avx2.skip_prologue
19.26% libx264.so.142 [.] 0x00000000000951bc
10.58% libx264.so.142 [.] x264_add8x8_idct_avx.skip_prologue
3.71% libgstvideoconvert.so [.] videoconvert_convert_matrix8
(3.71% with the SSE version vs. 23.51% before)
2.83% libx264.so.142 [.] x264_me_search_ref
2.82% orcexec.eWcXD2 [.] 0x0000000000000284
2.64% libc-2.18.so [.] __memcpy_sse2_unaligned
2.24% libgstvideo-1.0.so.0.204.0 [.] video_chroma_down_v2_guint8
2.22% libgstvideo-1.0.so.0.204.0 [.] video_chroma_down_h2_guint8
2.20% libx264.so.142 [.]
x264_macroblock_cache_load_progressive
1.95% libx264.so.142 [.] x264_sub8x8_dct_avx.skip_prologue
1.55% libx264.so.142 [.] x264_macroblock_analyse
0.97% libx264.so.142 [.] x264_macroblock_encode
0.94% libx264.so.142 [.] x264_macroblock_cache_save
0.90% libx264.so.142 [.] x264_mb_predict_mv_direct16x16
0.83% libx264.so.142 [.] x264_mb_predict_mv_ref16x16
0.62% perf [.] 0x0000000000067844
0.59% libx264.so.142 [.] x264_mb_encode_chroma
0.39% libx264.so.142 [.] x264_macroblock_probe_skip
Is it possible to add videoconvert_convert_matrix8_sse upstream?
static void
videoconvert_convert_matrix8 (VideoConvert * convert, gpointer pixels)
{
int i,j;
guint8 *p = pixels;
__m128i v_byte1 = _mm_set1_epi32(0x000000ff);
__m128i v_byte3 = _mm_set1_epi32(0x00ff0000);
__m128i v_mat_00 = _mm_set1_epi16((short
int)convert->cmatrix[0][0]);
__m128i v_mat_01 = _mm_set1_epi16((short
int)convert->cmatrix[0][1]);
__m128i v_mat_02 = _mm_set1_epi16((short
int)convert->cmatrix[0][2]);
__m128i v_mat_03 = _mm_set1_epi16((short
int)convert->cmatrix[0][3]);
__m128i v_mat_04 = _mm_set1_epi16((short
int)convert->cmatrix[1][0]);
__m128i v_mat_05 = _mm_set1_epi16((short
int)convert->cmatrix[1][1]);
__m128i v_mat_06 = _mm_set1_epi16((short
int)convert->cmatrix[1][2]);
__m128i v_mat_07 = _mm_set1_epi16((short
int)convert->cmatrix[1][3]);
__m128i v_mat_08 = _mm_set1_epi16((short
int)convert->cmatrix[2][0]);
__m128i v_mat_09 = _mm_set1_epi16((short
int)convert->cmatrix[2][1]);
__m128i v_mat_10 = _mm_set1_epi16((short
int)convert->cmatrix[2][2]);
__m128i v_mat_11 = _mm_set1_epi16((short
int)convert->cmatrix[2][3]);
__m128i mask2 = _mm_set1_epi32(0x00ff00ff);
__m128i mask_y1 = _mm_set_epi8((char)128, (char)128, 12, (char)128,
(char)128, (char)128, 8, (char)128,
(char)128, (char)128, 4, (char)128,
(char)128, (char)128, 0, (char)128);
__m128i mask_y2 = _mm_set_epi8((char)128, (char)128, 14,
(char)128, (char)128, (char)128, 10, (char)128,
(char)128, (char)128, 6, (char)128,
(char)128, (char)128, 2, (char)128);
__m128i mask_u1 = _mm_set_epi8((char)128, 12, (char)128, (char)128,
(char)128, 8, (char)128, (char)128,
(char)128, 4, (char)128, (char)128,
(char)128, 0, (char)128, (char)128);
__m128i mask_u2 = _mm_set_epi8((char)128, 14, (char)128, (char)128,
(char)128, 10, (char)128, (char)128,
(char)128, 6, (char)128, (char)128,
(char)128, 2, (char)128, (char)128);
__m128i mask_v1 = _mm_set_epi8(12, (char)128, (char)128, (char)128,
8, (char)128, (char)128, (char)128,
4, (char)128, (char)128, (char)128,
0, (char)128, (char)128, (char)128);
__m128i mask_v2 = _mm_set_epi8(14, (char)128, (char)128, (char)128,
10, (char)128, (char)128, (char)128,
6, (char)128, (char)128, (char)128,
2, (char)128, (char)128, (char)128);
for (i=0; i<convert->width / 8; i++) {
__m128i a1, a2, r, g, b, y, u, v, res;
a1 = _mm_loadu_si128((__m128i *)&p[i*32]);
a2 = _mm_loadu_si128((__m128i *)&p[i*32 + 16]);
r = _mm_or_si128(_mm_and_si128(_mm_srli_si128(a1, 1),
v_byte1), _mm_and_si128(_mm_slli_si128(a2, 1), v_byte3));
g = _mm_or_si128(_mm_and_si128(_mm_srli_si128(a1, 2),
v_byte1), _mm_and_si128(a2, v_byte3));
b = _mm_or_si128(_mm_and_si128(_mm_srli_si128(a1, 3),
v_byte1), _mm_and_si128(_mm_srli_si128(a2, 1), v_byte3));
y = _mm_add_epi16(
_mm_add_epi16(
_mm_mullo_epi16(r, v_mat_00),
_mm_mullo_epi16(g, v_mat_01)),
_mm_add_epi16(
_mm_mullo_epi16(b, v_mat_02),
v_mat_03));
y = _mm_and_si128(_mm_srai_epi16(y, 8), mask2);
u = _mm_add_epi16(
_mm_add_epi16(
_mm_mullo_epi16(r, v_mat_04),
_mm_mullo_epi16(g, v_mat_05)),
_mm_add_epi16(
_mm_mullo_epi16(b, v_mat_06),
v_mat_07));
u = _mm_and_si128(_mm_srai_epi16(u, 8), mask2);
v = _mm_add_epi16(
_mm_add_epi16(
_mm_mullo_epi16(r, v_mat_08),
_mm_mullo_epi16(g, v_mat_09)),
_mm_add_epi16(
_mm_mullo_epi16(b, v_mat_10),
v_mat_11));
v = _mm_and_si128(_mm_srai_epi16(v, 8), mask2);
res = _mm_or_si128(_mm_shuffle_epi8(y, mask_y1),
_mm_shuffle_epi8(u, mask_u1));
res = _mm_or_si128(res, _mm_shuffle_epi8(v, mask_v1));
_mm_storeu_si128((__m128i *)&p[i*32], res);
res = _mm_or_si128(_mm_shuffle_epi8(y, mask_y2),
_mm_shuffle_epi8(u, mask_u2));
res = _mm_or_si128(res, _mm_shuffle_epi8(v, mask_v2));
_mm_storeu_si128((__m128i *)&p[i*32 + 16], res);
}
}
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.freedesktop.org/archives/gstreamer-devel/attachments/20140621/04e21485/attachment.html>
More information about the gstreamer-devel
mailing list