<div dir="ltr">CPU time of this pipeline<div>gst-launch-1.0 -e ximagesrc display-name=:0 use-damage=0 ! multiqueue ! video/x-raw,format=BGRx ! videoconvert ! video/x-raw,format=I420,framerate=30/1 ! multiqueue ! vaapiencode_h264  ! vaapiparse_h264 ! multiqueue ! matroskamux name=muxer muxer. ! progressreport name=Rec_time ! filesink location=/disk/tmp//rec_2014-10-05_123757.mkv</div><div>and then</div><div><div>perf stat -p `pidof gst-launch-1.0`</div></div><div><br></div><div><br></div></div><div class="gmail_extra"><br><div class="gmail_quote">2014-10-05 16:18 GMT+03:00 Tim Müller <span dir="ltr"><<a href="mailto:tim@centricular.com" target="_blank">tim@centricular.com</a>></span>:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">On Sun, 2014-10-05 at 12:04 +0300, Yaroslav Andrusyak wrote:<br>
<br>
Hi,<br>
<br>
I'm a bit confused by these numbers. What exactly did you measure here?<br>
<br>
Cheers<br>
 -Tim<br>
<div class="HOEnZb"><div class="h5"><br>
> Now I have finished testing the new ORC version<br>
> of videoconvert_convert_matrix8.<br>
> gst 1.4.3<br>
>  Performance counter stats for process id '1899':<br>
><br>
><br>
>       15064.538253      task-clock (msec)         #    0.548 CPUs<br>
> utilized           [100.00%]<br>
>             13,397      context-switches          #    0.889 K/sec<br>
> [100.00%]<br>
>              1,841      cpu-migrations            #    0.122 K/sec<br>
> [100.00%]<br>
>                 15      page-faults               #    0.001 K/sec<br>
><br>
>     47,656,623,006      cycles                    #    3.163 GHz<br>
> [100.00%]<br>
>     13,852,194,083      stalled-cycles-frontend   #   29.07% frontend<br>
> cycles idle    [100.00%]<br>
>    <not supported>      stalled-cycles-backend<br>
>    128,121,252,488      instructions              #    2.69  insns per<br>
> cycle<br>
>                                                   #    0.11  stalled<br>
> cycles per insn [100.00%]<br>
>      9,185,407,626      branches                  #  609.737 M/sec<br>
> [100.00%]<br>
>         14,321,378      branch-misses             #    0.16% of all<br>
> branches<br>
><br>
><br>
>       27.470107935 seconds time elapsed<br>
><br>
><br>
><br>
><br>
><br>
><br>
> gst 1.4.3 + SSE patch<br>
> Performance counter stats for process id '1816': 6684.168028<br>
> task-clock (msec) # 0.222 CPUs utilized 12,919 context-switches #<br>
> 0.002 M/sec 994 cpu-migrations # 0.149 K/sec 15 page-faults # 0.002<br>
> K/sec 21,026,251,117 cycles # 3.146 GHz 6,601,947,003<br>
> stalled-cycles-frontend # 31.40% frontend cycles idle <not supported><br>
> stalled-cycles-backend 50,980,515,172 instructions # 2.42 insns per<br>
> cycle # 0.13 stalled cycles per insn 2,629,078,867 branches # 393.329<br>
> M/sec 14,069,001 branch-misses # 0.54% of all branches 30.173457857<br>
> seconds time elapsed<br>
><br>
><br>
><br>
> gst 1.5 ORC<br>
> Performance counter stats for process id '31045':<br>
><br>
><br>
>       13233.834604      task-clock (msec)         #    0.297 CPUs<br>
> utilized<br>
>             25,289      context-switches          #    0.002 M/sec<br>
><br>
>              2,173      cpu-migrations            #    0.164 K/sec<br>
><br>
>                 22      page-faults               #    0.002 K/sec<br>
><br>
>     41,150,975,897      cycles                    #    3.110 GHz<br>
><br>
>     13,982,410,478      stalled-cycles-frontend   #   33.98% frontend<br>
> cycles idle<br>
>    <not supported>      stalled-cycles-backend<br>
>    101,877,265,752      instructions              #    2.48  insns per<br>
> cycle<br>
>                                                   #    0.14  stalled<br>
> cycles per insn<br>
>      5,199,227,991      branches                  #  392.874 M/sec<br>
><br>
>         22,454,841      branch-misses             #    0.43% of all<br>
> branches<br>
><br>
><br>
>       44.537258699 seconds time elapsed<br>
><br>
><br>
><br>
><br>
><br>
><br>
> Good job<br>
><br>
> 2014-06-23 8:49 GMT+03:00 Yaroslav Andrusyak <<a href="mailto:pontostroy@gmail.com">pontostroy@gmail.com</a>>:<br>
>         Another multi-platform version uses tables instead of multiplication<br>
>         and packs all components into one 64-bit number<br>
><br>
><br>
>         static int op=0;<br>
>         static int64_t t_r3[256], t_g3[256], t_b3[256];<br>
><br>
>         static void<br>
>         videoconvert_convert_matrix8 (VideoConvert * convert, gpointer p)<br>
>         {<br>
>          while (op<2) {<br>
>         int i, j;<br>
>               int k_r[] = { convert->cmatrix[0][0], convert->cmatrix[1][0], convert->cmatrix[2][0] };<br>
>               int k_g[] = { convert->cmatrix[0][1], convert->cmatrix[1][1], convert->cmatrix[2][1] };<br>
>               int k_b[] = { convert->cmatrix[0][2], convert->cmatrix[1][2], convert->cmatrix[2][2] };<br>
><br>
><br>
>               for (i = 0; i <= 255; i++)<br>
>               {<br>
>                       int64_t r = 0, g = 0, b = 0;<br>
>                       for (j = 0; j <= 2; j++)<br>
>                       {<br>
>                               r = (r << 16) + k_r[j] * i;<br>
>                               g = (g << 16) + k_g[j] * i;<br>
>                               b = (b << 16) + k_b[j] * i;<br>
>                       }<br>
>                       t_r3[i] = r;<br>
>                       t_g3[i] = g;<br>
>                       t_b3[i] = b;<br>
>                       op=2;<br>
>               }<br>
>          }<br>
>           int t;<br>
>               uint8_t r, g, b;<br>
>               guint8 *pixels = p;<br>
>               int64_t c = ((int64_t) convert->cmatrix[0][3] << 32)<br>
>                               + ((int64_t) convert->cmatrix[1][3] << 16)<br>
>                               + ((int64_t) convert->cmatrix[2][3] << 0);<br>
><br>
>               for (t = 0; t < convert->width * 4; t += 4)<br>
>               {<br>
>                       r = pixels[t + 1];<br>
>                       g = pixels[t + 2];<br>
>                       b = pixels[t + 3];<br>
><br>
>                       int64_t x3 = t_r3[r] + t_g3[g] + t_b3[b] + c;<br>
><br>
>                       pixels[t + 1] = x3 >> 40;<br>
>                       pixels[t + 2] = x3 >> 24;<br>
>                       pixels[t + 3] = x3 >> 8;<br>
>               }<br>
>         }<br>
><br>
><br>
>         and result on i5-3330 3GHz<br>
><br>
><br>
>         x86 stock\table64<br>
><br>
>         Performance counter stats for process id '2427':<br>
><br>
>               24441.388683 task-clock                #    0.555 CPUs<br>
>         utilized<br>
>                     16,117 context-switches          #    0.659 K/sec<br>
><br>
>                        219 cpu-migrations            #    0.009 K/sec<br>
><br>
>                      5,233 page-faults               #    0.214 K/sec<br>
><br>
>             77,409,381,175 cycles                    #    3.167 GHz<br>
><br>
>             26,292,718,337 stalled-cycles-frontend   #   33.97%<br>
>         frontend cycles idle<br>
>            <not supported> stalled-cycles-backend<br>
>            197,607,036,204 instructions              #    2.55  insns<br>
>         per cycle<br>
>                                                      #    0.13<br>
>          stalled cycles per insn<br>
>             12,292,242,916 branches                  #  502.927 M/sec<br>
><br>
>                 21,693,862 branch-misses             #    0.18% of all<br>
>         branches<br>
>            <not supported> L1-dcache-loads:HG<br>
>                481,143,745 L1-dcache-load-misses:HG  #    0.00% of all<br>
>         L1-dcache hits<br>
>                239,777,528 LLC-loads:HG              #    9.810 M/sec<br>
><br>
>            <not supported> LLC-load-misses:HG<br>
><br>
>               44.023759316 seconds time elapsed<br>
><br>
><br>
><br>
>         Performance counter stats for process id '2822':<br>
><br>
>               16221.909026 task-clock                #    0.339 CPUs<br>
>         utilized<br>
>                     15,932 context-switches          #    0.982 K/sec<br>
><br>
>                      1,189 cpu-migrations            #    0.073 K/sec<br>
><br>
>                      3,315 page-faults               #    0.204 K/sec<br>
><br>
>             51,320,206,381 cycles                    #    3.164 GHz<br>
><br>
>             13,246,360,560 stalled-cycles-frontend   #   25.81%<br>
>         frontend cycles idle<br>
>            <not supported> stalled-cycles-backend<br>
>            127,814,464,535 instructions              #    2.49  insns<br>
>         per cycle<br>
>                                                      #    0.10<br>
>          stalled cycles per insn<br>
>              5,902,096,929 branches                  #  363.835 M/sec<br>
><br>
>                 21,703,222 branch-misses             #    0.37% of all<br>
>         branches<br>
>            <not supported> L1-dcache-loads:HG<br>
>                608,012,138 L1-dcache-load-misses:HG  #    0.00% of all<br>
>         L1-dcache hits<br>
>                250,741,676 LLC-loads:HG              #   15.457 M/sec<br>
><br>
>            <not supported> LLC-load-misses:HG<br>
><br>
>               47.831511910 seconds time elapsed<br>
><br>
><br>
>         x86_64 stock\table64<br>
><br>
>         Performance counter stats for process id '3506':<br>
><br>
>               23258.800974 task-clock                #    0.455 CPUs<br>
>         utilized<br>
>                     18,130 context-switches          #    0.779 K/sec<br>
><br>
>                      2,288 cpu-migrations            #    0.098 K/sec<br>
><br>
>                      1,331 page-faults               #    0.057 K/sec<br>
><br>
>             73,692,193,376 cycles                    #    3.168 GHz<br>
><br>
>             21,218,974,690 stalled-cycles-frontend   #   28.79%<br>
>         frontend cycles idle<br>
>            <not supported> stalled-cycles-backend<br>
>            198,710,985,363 instructions              #    2.70  insns<br>
>         per cycle<br>
>                                                      #    0.11<br>
>          stalled cycles per insn<br>
>             14,252,859,241 branches                  #  612.794 M/sec<br>
><br>
>                 22,748,214 branch-misses             #    0.16% of all<br>
>         branches<br>
>            <not supported> L1-dcache-loads:HG<br>
>                566,065,245 L1-dcache-load-misses:HG  #    0.00% of all<br>
>         L1-dcache hits<br>
>                267,280,364 LLC-loads:HG              #   11.492 M/sec<br>
><br>
>            <not supported> LLC-load-misses:HG<br>
><br>
>               51.148678220 seconds time elapsed<br>
><br>
>         Performance counter stats for process id '3379':<br>
><br>
><br>
><br>
><br>
><br>
>               12461.157685 task-clock                #    0.250 CPUs<br>
>         utilized<br>
>                      5,485 context-switches          #    0.001 M/sec<br>
><br>
>                        238 cpu-migrations            #    0.019 K/sec<br>
><br>
><br>
>                      8,634 page-faults               #    0.693 K/sec<br>
><br>
><br>
>             39,284,950,355 cycles                    #    3.153 GHz<br>
><br>
><br>
>              8,891,423,340 stalled-cycles-frontend   #   22.63%<br>
>         frontend cycles idle<br>
><br>
>            <not supported> stalled-cycles-backend<br>
><br>
><br>
>            105,668,751,265 instructions              #    2.69  insns<br>
>         per cycle<br>
><br>
>                                                      #    0.08<br>
>          stalled cycles per insn<br>
>              6,123,961,551 branches                  #  491.444 M/sec<br>
><br>
>                 20,472,272 branch-misses             #    0.33% of all<br>
>         branches<br>
>            <not supported> L1-dcache-loads:HG<br>
>                608,576,686 L1-dcache-load-misses:HG  #    0.00% of all<br>
>         L1-dcache hits<br>
>                259,895,252 LLC-loads:HG              #   20.856 M/sec<br>
><br>
>            <not supported> LLC-load-misses:HG<br>
><br>
>               49.792876350 seconds time elapsed<br>
><br>
><br>
><br>
><br>
><br>
><br>
>         2014-06-21 20:30 GMT+00:00 Yaroslav Andrusyak<br>
>         <<a href="mailto:pontostroy@gmail.com">pontostroy@gmail.com</a>>:<br>
>                 One good man gave me an SSE3 version<br>
>                 of videoconvert_convert_matrix8 (4-7 times faster)<br>
><br>
>                 Samples: 198K of event 'cycles', Event count<br>
>                 (approx.): 112390215455<br>
><br>
><br>
>                  31.71%  libx264.so.142               [.]<br>
>                 x264_add8x8_idct_avx2.skip_prologue<br>
>                  19.26%  libx264.so.142               [.]<br>
>                 0x00000000000951bc<br>
>                  10.58%  libx264.so.142               [.]<br>
>                 x264_add8x8_idct_avx.skip_prologue<br>
>                   3.71%  libgstvideoconvert.so        [.]<br>
>                 videoconvert_convert_matrix8         (3.71% vs  23.51%<br>
>                 )<br>
>                   2.83%  libx264.so.142               [.]<br>
>                 x264_me_search_ref<br>
>                   2.82%  orcexec.eWcXD2               [.]<br>
>                 0x0000000000000284<br>
>                   2.64%  <a href="http://libc-2.18.so" target="_blank">libc-2.18.so</a>                 [.]<br>
>                 __memcpy_sse2_unaligned<br>
>                   2.24%  libgstvideo-1.0.so.0.204.0   [.]<br>
>                 video_chroma_down_v2_guint8<br>
>                   2.22%  libgstvideo-1.0.so.0.204.0   [.]<br>
>                 video_chroma_down_h2_guint8<br>
>                   2.20%  libx264.so.142               [.]<br>
>                 x264_macroblock_cache_load_progressive<br>
>                   1.95%  libx264.so.142               [.]<br>
>                 x264_sub8x8_dct_avx.skip_prologue<br>
>                   1.55%  libx264.so.142               [.]<br>
>                 x264_macroblock_analyse<br>
>                   0.97%  libx264.so.142               [.]<br>
>                 x264_macroblock_encode<br>
>                   0.94%  libx264.so.142               [.]<br>
>                 x264_macroblock_cache_save<br>
>                   0.90%  libx264.so.142               [.]<br>
>                 x264_mb_predict_mv_direct16x16<br>
>                   0.83%  libx264.so.142               [.]<br>
>                 x264_mb_predict_mv_ref16x16<br>
>                   0.62%  perf                         [.]<br>
>                 0x0000000000067844<br>
>                   0.59%  libx264.so.142               [.]<br>
>                 x264_mb_encode_chroma<br>
>                   0.39%  libx264.so.142               [.]<br>
>                 x264_macroblock_probe_skip<br>
><br>
><br>
><br>
><br>
>                 Is it possible to add videoconvert_convert_matrix8_sse<br>
>                 upstream?<br>
><br>
><br>
>                 static void<br>
>                 videoconvert_convert_matrix8 (VideoConvert * convert,<br>
>                 gpointer pixels)<br>
>                 {<br>
><br>
>                         int i,j;<br>
>                         guint8 *p = pixels;<br>
><br>
>                         __m128i v_byte1 = _mm_set1_epi32(0x000000ff);<br>
>                         __m128i v_byte3 = _mm_set1_epi32(0x00ff0000);<br>
>                         __m128i v_mat_00 = _mm_set1_epi16((short<br>
>                 int)convert->cmatrix[0][0]);<br>
>                         __m128i v_mat_01 = _mm_set1_epi16((short<br>
>                 int)convert->cmatrix[0][1]);<br>
>                         __m128i v_mat_02 = _mm_set1_epi16((short<br>
>                 int)convert->cmatrix[0][2]);<br>
>                         __m128i v_mat_03 = _mm_set1_epi16((short<br>
>                 int)convert->cmatrix[0][3]);<br>
>                         __m128i v_mat_04 = _mm_set1_epi16((short<br>
>                 int)convert->cmatrix[1][0]);<br>
>                         __m128i v_mat_05 = _mm_set1_epi16((short<br>
>                 int)convert->cmatrix[1][1]);<br>
>                         __m128i v_mat_06 = _mm_set1_epi16((short<br>
>                 int)convert->cmatrix[1][2]);<br>
>                         __m128i v_mat_07 = _mm_set1_epi16((short<br>
>                 int)convert->cmatrix[1][3]);<br>
>                         __m128i v_mat_08 = _mm_set1_epi16((short<br>
>                 int)convert->cmatrix[2][0]);<br>
>                         __m128i v_mat_09 = _mm_set1_epi16((short<br>
>                 int)convert->cmatrix[2][1]);<br>
>                         __m128i v_mat_10 = _mm_set1_epi16((short<br>
>                 int)convert->cmatrix[2][2]);<br>
>                         __m128i v_mat_11 = _mm_set1_epi16((short<br>
>                 int)convert->cmatrix[2][3]);<br>
><br>
>                         __m128i mask2   = _mm_set1_epi32(0x00ff00ff);<br>
><br>
>                         __m128i mask_y1 = _mm_set_epi8((char)128,<br>
>                 (char)128, 12, (char)128,   (char)128, (char)128, 8,<br>
>                 (char)128,<br>
>                                                         (char)128,<br>
>                 (char)128, 4, (char)128,   (char)128, (char)128, 0,<br>
>                 (char)128);<br>
><br>
>                         __m128i mask_y2 = _mm_set_epi8((char)128,<br>
>                 (char)128, 14,  (char)128,  (char)128, (char)128, 10,<br>
>                 (char)128,<br>
>                                                         (char)128,<br>
>                 (char)128, 6, (char)128,   (char)128, (char)128, 2,<br>
>                 (char)128);<br>
><br>
>                         __m128i mask_u1 = _mm_set_epi8((char)128, 12,<br>
>                 (char)128, (char)128,   (char)128, 8, (char)128,<br>
>                 (char)128,<br>
>                                                         (char)128, 4,<br>
>                 (char)128, (char)128,   (char)128, 0, (char)128,<br>
>                 (char)128);<br>
><br>
>                         __m128i mask_u2 = _mm_set_epi8((char)128, 14,<br>
>                 (char)128, (char)128,   (char)128, 10, (char)128,<br>
>                 (char)128,<br>
>                                                         (char)128, 6,<br>
>                 (char)128, (char)128,   (char)128, 2, (char)128,<br>
>                 (char)128);<br>
><br>
>                         __m128i mask_v1 = _mm_set_epi8(12, (char)128,<br>
>                 (char)128, (char)128,   8, (char)128, (char)128,<br>
>                 (char)128,<br>
>                                                         4, (char)128,<br>
>                 (char)128, (char)128,   0, (char)128, (char)128,<br>
>                 (char)128);<br>
><br>
>                         __m128i mask_v2 = _mm_set_epi8(14, (char)128,<br>
>                 (char)128, (char)128,   10, (char)128, (char)128,<br>
>                 (char)128,<br>
>                                                         6, (char)128,<br>
>                 (char)128, (char)128,   2, (char)128, (char)128,<br>
>                 (char)128);<br>
><br>
><br>
>                         for (i=0; i<convert->width / 8; i++) {<br>
>                                 __m128i a1, a2, r, g, b, y, u, v, res;<br>
><br>
>                                 a1 = _mm_loadu_si128((__m128i<br>
>                 *)&p[i*32]);<br>
>                                 a2 = _mm_loadu_si128((__m128i<br>
>                 *)&p[i*32 + 16]);<br>
><br>
>                                 r =<br>
>                 _mm_or_si128(_mm_and_si128(_mm_srli_si128(a1, 1),<br>
>                 v_byte1), _mm_and_si128(_mm_slli_si128(a2, 1),<br>
>                 v_byte3));<br>
>                                 g =<br>
>                 _mm_or_si128(_mm_and_si128(_mm_srli_si128(a1, 2),<br>
>                 v_byte1), _mm_and_si128(a2, v_byte3));<br>
>                                 b =<br>
>                 _mm_or_si128(_mm_and_si128(_mm_srli_si128(a1, 3),<br>
>                 v_byte1), _mm_and_si128(_mm_srli_si128(a2, 1),<br>
>                 v_byte3));<br>
><br>
><br>
>                                 y = _mm_add_epi16(<br>
>                                         _mm_add_epi16(<br>
>                                                 _mm_mullo_epi16(r,<br>
>                 v_mat_00),<br>
>                                                 _mm_mullo_epi16(g,<br>
>                 v_mat_01)),<br>
>                                         _mm_add_epi16(<br>
>                                                 _mm_mullo_epi16(b,<br>
>                 v_mat_02),<br>
>                                                 v_mat_03));<br>
><br>
>                                 y = _mm_and_si128(_mm_srai_epi16(y,<br>
>                 8), mask2);<br>
><br>
>                                 u = _mm_add_epi16(<br>
>                                         _mm_add_epi16(<br>
>                                                 _mm_mullo_epi16(r,<br>
>                 v_mat_04),<br>
>                                                 _mm_mullo_epi16(g,<br>
>                 v_mat_05)),<br>
>                                         _mm_add_epi16(<br>
>                                                 _mm_mullo_epi16(b,<br>
>                 v_mat_06),<br>
>                                                 v_mat_07));<br>
><br>
>                                 u  = _mm_and_si128(_mm_srai_epi16(u,<br>
>                 8), mask2);<br>
><br>
>                                 v = _mm_add_epi16(<br>
>                                         _mm_add_epi16(<br>
>                                                 _mm_mullo_epi16(r,<br>
>                 v_mat_08),<br>
>                                                 _mm_mullo_epi16(g,<br>
>                 v_mat_09)),<br>
>                                         _mm_add_epi16(<br>
>                                                 _mm_mullo_epi16(b,<br>
>                 v_mat_10),<br>
>                                                 v_mat_11));<br>
><br>
>                                 v = _mm_and_si128(_mm_srai_epi16(v,<br>
>                 8), mask2);<br>
><br>
><br>
>                                 res = _mm_or_si128(_mm_shuffle_epi8(y,<br>
>                 mask_y1), _mm_shuffle_epi8(u, mask_u1));<br>
>                                 res = _mm_or_si128(res,<br>
>                 _mm_shuffle_epi8(v, mask_v1));<br>
><br>
>                                 _mm_storeu_si128((__m128i *)&p[i*32],<br>
>                 res);<br>
><br>
>                                 res = _mm_or_si128(_mm_shuffle_epi8(y,<br>
>                 mask_y2), _mm_shuffle_epi8(u, mask_u2));<br>
>                                res = _mm_or_si128(res,<br>
>                 _mm_shuffle_epi8(v, mask_v2));<br>
><br>
>                                 _mm_storeu_si128((__m128i *)&p[i*32 +<br>
>                 16], res);<br>
>                         }<br>
>                 }<br>
><br>
><br>
><br>
><br>
><br>
</div></div><span class="im HOEnZb">> _______________________________________________<br>
> gstreamer-devel mailing list<br>
> <a href="mailto:gstreamer-devel@lists.freedesktop.org">gstreamer-devel@lists.freedesktop.org</a><br>
> <a href="http://lists.freedesktop.org/mailman/listinfo/gstreamer-devel" target="_blank">http://lists.freedesktop.org/mailman/listinfo/gstreamer-devel</a><br>
<br>
</span><span class="HOEnZb"><font color="#888888">--<br>
Tim Müller, Centricular Ltd - <a href="http://www.centricular.com" target="_blank">http://www.centricular.com</a><br>
</font></span><div class="HOEnZb"><div class="h5"><br>
_______________________________________________<br>
gstreamer-devel mailing list<br>
<a href="mailto:gstreamer-devel@lists.freedesktop.org">gstreamer-devel@lists.freedesktop.org</a><br>
<a href="http://lists.freedesktop.org/mailman/listinfo/gstreamer-devel" target="_blank">http://lists.freedesktop.org/mailman/listinfo/gstreamer-devel</a><br>
</div></div></blockquote></div><br></div>