<div dir="ltr">Now i finished testing new ORC verison of <span style="color:rgb(0,0,0);font-family:'Droid Sans Mono',monospace,sans-serif;font-size:1em;white-space:pre-wrap">videoconvert_convert_matrix8.</span><div><font color="#000000" face="Droid Sans Mono, monospace, sans-serif"><span style="white-space:pre-wrap">gst 1.4.3</span></font></div><div><font color="#000000" face="Droid Sans Mono, monospace, sans-serif"><div style><span style="white-space:pre-wrap"> Performance counter stats for process id '1899':</span></div><div style><span style="white-space:pre-wrap"><br></span></div><div style><span style="white-space:pre-wrap">      15064.538253      task-clock (msec)         #    0.548 CPUs utilized           [100.00%]</span></div><div style><span style="white-space:pre-wrap">            13,397      context-switches          #    0.889 K/sec                   [100.00%]</span></div><div style><span style="white-space:pre-wrap">             1,841      cpu-migrations            #    0.122 K/sec                   [100.00%]</span></div><div style><span style="white-space:pre-wrap">                15      page-faults               #    0.001 K/sec                  </span></div><div style><span style="white-space:pre-wrap">    47,656,623,006      cycles                    #    3.163 GHz                     [100.00%]</span></div><div style><span style="white-space:pre-wrap">    13,852,194,083      stalled-cycles-frontend   #   29.07% frontend cycles idle    [100.00%]</span></div><div style><span style="white-space:pre-wrap">   <not supported>      stalled-cycles-backend   </span></div><div style><span style="white-space:pre-wrap">   128,121,252,488      instructions              #    2.69  insns per cycle        </span></div><div style><span style="white-space:pre-wrap">                                                  #    0.11  stalled cycles per insn [100.00%]</span></div><div style><span style="white-space:pre-wrap">     9,185,407,626      branches                  
#  609.737 M/sec                   [100.00%]</span></div><div style><span style="white-space:pre-wrap">        14,321,378      branch-misses             #    0.16% of all branches        </span></div><div style><span style="white-space:pre-wrap"><br></span></div><div style><span style="white-space:pre-wrap">      27.470107935 seconds time elapsed</span></div><div style="white-space:pre-wrap"><br></div><div style="white-space:pre-wrap"><br></div><div style="white-space:pre-wrap"><br></div><div style="white-space:pre-wrap">gst 1.4.3 + SSE patch</div><div style><span style="white-space:pre-wrap"> Performance counter stats for process id '1816':

       6684.168028      task-clock (msec)         #    0.222 CPUs utilized          
            12,919      context-switches          #    0.002 M/sec                  
               994      cpu-migrations            #    0.149 K/sec                  
                15      page-faults               #    0.002 K/sec                  
    21,026,251,117      cycles                    #    3.146 GHz                    
     6,601,947,003      stalled-cycles-frontend   #   31.40% frontend cycles idle   
   <not supported>      stalled-cycles-backend   
    50,980,515,172      instructions              #    2.42  insns per cycle        
                                                  #    0.13  stalled cycles per insn
     2,629,078,867      branches                  #  393.329 M/sec                  
        14,069,001      branch-misses             #    0.54% of all branches        

      30.173457857 seconds time elapsed</span></div><span style="white-space:pre-wrap"> </span></font></div><div><font color="#000000" face="Droid Sans Mono, monospace, sans-serif"><span style="white-space:pre-wrap"><br></span></font></div><div><font color="#000000" face="Droid Sans Mono, monospace, sans-serif"><span style="white-space:pre-wrap">gst 1.5 ORC</span></font></div><div><font color="#000000" face="Droid Sans Mono, monospace, sans-serif"><div style><span style="white-space:pre-wrap">Performance counter stats for process id '31045':</span></div><div style><span style="white-space:pre-wrap"><br></span></div><div style><span style="white-space:pre-wrap">      13233.834604      task-clock (msec)         #    0.297 CPUs utilized          </span></div><div style><span style="white-space:pre-wrap">            25,289      context-switches          #    0.002 M/sec                  </span></div><div style><span style="white-space:pre-wrap">             2,173      cpu-migrations            #    0.164 K/sec                  </span></div><div style><span style="white-space:pre-wrap">                22      page-faults               #    0.002 K/sec                  </span></div><div style><span style="white-space:pre-wrap">    41,150,975,897      cycles                    #    3.110 GHz                    </span></div><div style><span style="white-space:pre-wrap">    13,982,410,478      stalled-cycles-frontend   #   33.98% frontend cycles idle   </span></div><div style><span style="white-space:pre-wrap">   <not supported>      stalled-cycles-backend   </span></div><div style><span style="white-space:pre-wrap">   101,877,265,752      instructions              #    2.48  insns per cycle        </span></div><div style><span style="white-space:pre-wrap">                                                  #    0.14  stalled cycles per insn</span></div><div style><span style="white-space:pre-wrap">     5,199,227,991      branches                  #  392.874 M/sec             
     </span></div><div style><span style="white-space:pre-wrap">        22,454,841      branch-misses             #    0.43% of all branches        </span></div><div style><span style="white-space:pre-wrap"><br></span></div><div style><span style="white-space:pre-wrap">      44.537258699 seconds time elapsed</span></div><div style="white-space:pre-wrap"><br></div><div style="white-space:pre-wrap"><br></div></font><div><div><span style="color:rgb(0,0,0);font-family:'Droid Sans Mono',monospace,sans-serif;font-size:1em;white-space:pre-wrap"><br></span></div><div><span style="color:rgb(0,0,0);font-family:'Droid Sans Mono',monospace,sans-serif;font-size:1em;white-space:pre-wrap">Good job</span></div></div></div></div><div class="gmail_extra"><br><div class="gmail_quote">2014-06-23 8:49 GMT+03:00 Yaroslav Andrusyak <span dir="ltr"><<a href="mailto:pontostroy@gmail.com" target="_blank">pontostroy@gmail.com</a>></span>:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><div dir="ltr">Other multi-platform version use  tables instead of multipl and all components in one 64-bit number<br><div class="gmail_extra"><br></div><div class="gmail_extra"><pre style="white-space:pre-wrap;word-wrap:break-word;margin:0.5em 1em;font-size:1rem"><code style="border:0px;margin-top:0.5em;margin-bottom:0.5em;font-family:'Droid Sans Mono',monospace,sans-serif;white-space:pre-wrap;font-size:1em"><font color="#000000">static int op=0;
static int64_t t_r3[256], t_g3[256], t_b3[256];

static void
videoconvert_convert_matrix8 (VideoConvert * convert, gpointer p)
{
 while (op<2) {   
int i, j;
        int k_r[] = { convert->cmatrix[0][0], convert->cmatrix[1][0], convert->cmatrix[2][0] };
        int k_g[] = { convert->cmatrix[0][1], convert->cmatrix[1][1], convert->cmatrix[2][1] };
        int k_b[] = { convert->cmatrix[0][2], convert->cmatrix[1][2], convert->cmatrix[2][2] };


        for (i = 0; i <= 255; i++)
        {
                int64_t r = 0, g = 0, b = 0;
                for (j = 0; j <= 2; j++)
                {
                        r = (r << 16) + k_r[j] * i;
                        g = (g << 16) + k_g[j] * i;
                        b = (b << 16) + k_b[j] * i;
                }
                t_r3[i] = r;
                t_g3[i] = g;
                t_b3[i] = b;
                op=2;
        }
 }
  int t;
        uint8_t r, g, b;
        guint8 *pixels = p;
        int64_t c = ((int64_t) convert->cmatrix[0][3] << 32)
                        + ((int64_t) convert->cmatrix[1][3] << 16)
                        + ((int64_t) convert->cmatrix[2][3] << 0);

        for (t = 0; t < convert->width * 4; t += 4)
        {
                r = pixels[t + 1];
                g = pixels[t + 2];
                b = pixels[t + 3];

                int64_t x3 = t_r3[r] + t_g3[g] + t_b3[b] + c;

                pixels[t + 1] = x3 >> 40;
                pixels[t + 2] = x3 >> 24;
                pixels[t + 3] = x3 >> 8;
        }
}</font></code><code style="color:rgb(248,248,248);border:0px;margin-top:0.5em;margin-bottom:0.5em;font-family:'Droid Sans Mono',monospace,sans-serif;white-space:pre-wrap;font-size:1em">
</code></pre><pre style="white-space:pre-wrap;word-wrap:break-word;margin:0.5em 1em;font-size:1rem"><code style="border:0px;margin-top:0.5em;margin-bottom:0.5em;font-family:'Droid Sans Mono',monospace,sans-serif;white-space:pre-wrap;font-size:1em"><font color="#000000"><br>
</font></code></pre><div class="gmail_extra">and result on i5-3330 3GHz</div><br><br>x86 stock\table64<br><br>Performance counter stats for process id '2427':<br><br>      24441.388683 task-clock                #    0.555 CPUs utilized          <br>
            16,117 context-switches          #    0.659 K/sec                  <br>               219 cpu-migrations            #    0.009 K/sec                  <br>             5,233 page-faults               #    0.214 K/sec                  <br>
    77,409,381,175 cycles                    #    3.167 GHz                    <br>    26,292,718,337 stalled-cycles-frontend   #   33.97% frontend cycles idle   <br>   <not supported> stalled-cycles-backend  <br>   197,607,036,204 instructions              #    2.55  insns per cycle        <br>
                                             #    0.13  stalled cycles per insn<br>    12,292,242,916 branches                  #  502.927 M/sec                  <br>        21,693,862 branch-misses             #    0.18% of all branches        <br>
   <not supported> L1-dcache-loads:HG      <br>       481,143,745 L1-dcache-load-misses:HG  #    0.00% of all L1-dcache hits  <br>       239,777,528 LLC-loads:HG              #    9.810 M/sec                  <br>   <not supported> LLC-load-misses:HG      <br>
<br>      44.023759316 seconds time elapsed<br><br><br><br>Performance counter stats for process id '2822':<br><br>      16221.909026 task-clock                #    0.339 CPUs utilized          <br>            15,932 context-switches          #    0.982 K/sec                  <br>
             1,189 cpu-migrations            #    0.073 K/sec                  <br>             3,315 page-faults               #    0.204 K/sec                  <br>    51,320,206,381 cycles                    #    3.164 GHz                    <br>
    13,246,360,560 stalled-cycles-frontend   #   25.81% frontend cycles idle   <br>   <not supported> stalled-cycles-backend  <br>   127,814,464,535 instructions              #    2.49  insns per cycle        <br>                                             #    0.10  stalled cycles per insn<br>
     5,902,096,929 branches                  #  363.835 M/sec                  <br>        21,703,222 branch-misses             #    0.37% of all branches        <br>   <not supported> L1-dcache-loads:HG      <br>       608,012,138 L1-dcache-load-misses:HG  #    0.00% of all L1-dcache hits  <br>
       250,741,676 LLC-loads:HG              #   15.457 M/sec                  <br>   <not supported> LLC-load-misses:HG      <br><br>      47.831511910 seconds time elapsed<br><br><br>x86_64 stock\table64<br><br>Performance counter stats for process id '3506':<br>
<br>      23258.800974 task-clock                #    0.455 CPUs utilized          <br>            18,130 context-switches          #    0.779 K/sec                  <br>             2,288 cpu-migrations            #    0.098 K/sec                  <br>
             1,331 page-faults               #    0.057 K/sec                  <br>    73,692,193,376 cycles                    #    3.168 GHz                    <br>    21,218,974,690 stalled-cycles-frontend   #   28.79% frontend cycles idle   <br>
   <not supported> stalled-cycles-backend  <br>   198,710,985,363 instructions              #    2.70  insns per cycle        <span class=""><br>                                             #    0.11  stalled cycles per insn<br></span>    14,252,859,241 branches                  #  612.794 M/sec                  <br>
        22,748,214 branch-misses             #    0.16% of all branches        <br>   <not supported> L1-dcache-loads:HG      <br>       566,065,245 L1-dcache-load-misses:HG  #    0.00% of all L1-dcache hits  <br>       267,280,364 LLC-loads:HG              #   11.492 M/sec                  <br>
   <not supported> LLC-load-misses:HG      <br><br>      51.148678220 seconds time elapsed<br>      <br>Performance counter stats for process id '3379':                                                                                                                                   <br>
                                                                                                                                                                                    <br>      12461.157685 task-clock                #    0.250 CPUs utilized  <br>
             5,485 context-switches          #    0.001 M/sec       <br>               238 cpu-migrations            #    0.019 K/sec                                                                <br>             8,634 page-faults               #    0.693 K/sec                                                                                                  <br>
    39,284,950,355 cycles                    #    3.153 GHz                                                                                     <br>     8,891,423,340 stalled-cycles-frontend   #   22.63% frontend cycles idle                                                                                <br>
   <not supported> stalled-cycles-backend                                                                                                         <br>   105,668,751,265 instructions              #    2.69  insns per cycle                                                                                                <br>
                                             #    0.08  stalled cycles per insn<br>     6,123,961,551 branches                  #  491.444 M/sec                   <br>        20,472,272 branch-misses             #    0.33% of all branches         <br>
   <not supported> L1-dcache-loads:HG      <br>       608,576,686 L1-dcache-load-misses:HG  #    0.00% of all L1-dcache hits<br>       259,895,252 LLC-loads:HG              #   20.856 M/sec          <br>   <not supported> LLC-load-misses:HG      <br>
<br>      49.792876350 seconds time elapsed<div><div class="h5"><br><br><br><br><br><br><div class="gmail_quote">2014-06-21 20:30 GMT+00:00 Yaroslav Andrusyak <span dir="ltr"><<a href="mailto:pontostroy@gmail.com" target="_blank">pontostroy@gmail.com</a>></span>:<br>
<blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left-width:1px;border-left-color:rgb(204,204,204);border-left-style:solid;padding-left:1ex"><div dir="ltr">One good man gave me an SSSE3 version of videoconvert_convert_matrix8 (4-7 times faster)<br>
<br><div><div>Samples: 198K of event 'cycles', Event count (approx.): 112390215455                                                                                                            </div>
<div> 31.71%  libx264.so.142               [.] x264_add8x8_idct_avx2.skip_prologue</div><div> 19.26%  libx264.so.142               [.] 0x00000000000951bc</div><div> 10.58%  libx264.so.142               [.] x264_add8x8_idct_avx.skip_prologue</div>

<div>  3.71%  libgstvideoconvert.so        [.] videoconvert_convert_matrix8         (3.71% vs <span style="font-family:arial,sans-serif;font-size:13px"> 23.51%</span>)</div><div>  2.83%  libx264.so.142               [.] x264_me_search_ref</div>

<div>  2.82%  orcexec.eWcXD2               [.] 0x0000000000000284</div><div>  2.64%  <a href="http://libc-2.18.so" target="_blank">libc-2.18.so</a>                 [.] __memcpy_sse2_unaligned</div><div>  2.24%  libgstvideo-1.0.so.0.204.0   [.] video_chroma_down_v2_guint8</div>

<div>  2.22%  libgstvideo-1.0.so.0.204.0   [.] video_chroma_down_h2_guint8</div><div>  2.20%  libx264.so.142               [.] x264_macroblock_cache_load_progressive</div><div>  1.95%  libx264.so.142               [.] x264_sub8x8_dct_avx.skip_prologue</div>

<div>  1.55%  libx264.so.142               [.] x264_macroblock_analyse</div><div>  0.97%  libx264.so.142               [.] x264_macroblock_encode</div><div>  0.94%  libx264.so.142               [.] x264_macroblock_cache_save</div>

<div>  0.90%  libx264.so.142               [.] x264_mb_predict_mv_direct16x16</div><div>  0.83%  libx264.so.142               [.] x264_mb_predict_mv_ref16x16</div><div>  0.62%  perf                         [.] 0x0000000000067844</div>

<div>  0.59%  libx264.so.142               [.] x264_mb_encode_chroma</div><div>  0.39%  libx264.so.142               [.] x264_macroblock_probe_skip</div></div><div><br></div><div><br></div><div>Is it possible to add videoconvert_convert_matrix8_sse upstream?<div>
<br>
<br>static void<br>videoconvert_convert_matrix8 (VideoConvert * convert, gpointer pixels)<br>{<br></div>        int i,j;<br>        guint8 *p = pixels;<br> <br>        __m128i v_byte1 = _mm_set1_epi32(0x000000ff);<br>        __m128i v_byte3 = _mm_set1_epi32(0x00ff0000);<br>

        __m128i v_mat_00 = _mm_set1_epi16((short int)convert->cmatrix[0][0]);<br>        __m128i v_mat_01 = _mm_set1_epi16((short int)convert->cmatrix[0][1]);<br>        __m128i v_mat_02 = _mm_set1_epi16((short int)convert->cmatrix[0][2]);<br>

        __m128i v_mat_03 = _mm_set1_epi16((short int)convert->cmatrix[0][3]);<br>        __m128i v_mat_04 = _mm_set1_epi16((short int)convert->cmatrix[1][0]);<br>        __m128i v_mat_05 = _mm_set1_epi16((short int)convert->cmatrix[1][1]);<br>

        __m128i v_mat_06 = _mm_set1_epi16((short int)convert->cmatrix[1][2]);<br>        __m128i v_mat_07 = _mm_set1_epi16((short int)convert->cmatrix[1][3]);<br>        __m128i v_mat_08 = _mm_set1_epi16((short int)convert->cmatrix[2][0]);<br>

        __m128i v_mat_09 = _mm_set1_epi16((short int)convert->cmatrix[2][1]);<br>        __m128i v_mat_10 = _mm_set1_epi16((short int)convert->cmatrix[2][2]);<br>        __m128i v_mat_11 = _mm_set1_epi16((short int)convert->cmatrix[2][3]);<br>

        <br>        __m128i mask2   = _mm_set1_epi32(0x00ff00ff);<br> <br>        __m128i mask_y1 = _mm_set_epi8((char)128, (char)128, 12, (char)128,   (char)128, (char)128, 8, (char)128,<br>                                        (char)128, (char)128, 4, (char)128,   (char)128, (char)128, 0, (char)128);<br>

 <br>        __m128i mask_y2 = _mm_set_epi8((char)128, (char)128, 14,  (char)128,  (char)128, (char)128, 10, (char)128,<br>                                        (char)128, (char)128, 6, (char)128,   (char)128, (char)128, 2, (char)128);<br>

 <br>        __m128i mask_u1 = _mm_set_epi8((char)128, 12, (char)128, (char)128,   (char)128, 8, (char)128, (char)128,<br>                                        (char)128, 4, (char)128, (char)128,   (char)128, 0, (char)128, (char)128);<br>

 <br>        __m128i mask_u2 = _mm_set_epi8((char)128, 14, (char)128, (char)128,   (char)128, 10, (char)128, (char)128,<br>                                        (char)128, 6, (char)128, (char)128,   (char)128, 2, (char)128, (char)128);<br>

 <br>        __m128i mask_v1 = _mm_set_epi8(12, (char)128, (char)128, (char)128,   8, (char)128, (char)128, (char)128,<br>                                        4, (char)128, (char)128, (char)128,   0, (char)128, (char)128, (char)128);<br>

 <br>        __m128i mask_v2 = _mm_set_epi8(14, (char)128, (char)128, (char)128,   10, (char)128, (char)128, (char)128,<br>                                        6, (char)128, (char)128, (char)128,   2, (char)128, (char)128, (char)128);<br>

 <br>        <br>        for (i=0; i<convert->width / 8; i++) {<br>                __m128i a1, a2, r, g, b, y, u, v, res;<br> <br>                a1 = _mm_loadu_si128((__m128i *)&p[i*32]);<br>                a2 = _mm_loadu_si128((__m128i *)&p[i*32 + 16]);<br>

 <br>                r = _mm_or_si128(_mm_and_si128(_mm_srli_si128(a1, 1), v_byte1), _mm_and_si128(_mm_slli_si128(a2, 1), v_byte3));<br>                g = _mm_or_si128(_mm_and_si128(_mm_srli_si128(a1, 2), v_byte1), _mm_and_si128(a2, v_byte3));<br>

                b = _mm_or_si128(_mm_and_si128(_mm_srli_si128(a1, 3), v_byte1), _mm_and_si128(_mm_srli_si128(a2, 1), v_byte3));<br> <br> <br>                y = _mm_add_epi16(<br>                        _mm_add_epi16(<br>

                                _mm_mullo_epi16(r, v_mat_00),<br>                                _mm_mullo_epi16(g, v_mat_01)),<br>                        _mm_add_epi16(<br>                                _mm_mullo_epi16(b, v_mat_02),<br>

                                v_mat_03));<br> <br>                y = _mm_and_si128(_mm_srai_epi16(y, 8), mask2);<br> <br>                u = _mm_add_epi16(<br>                        _mm_add_epi16(<br>                                _mm_mullo_epi16(r, v_mat_04),<br>

                                _mm_mullo_epi16(g, v_mat_05)),<br>                        _mm_add_epi16(<br>                                _mm_mullo_epi16(b, v_mat_06),<br>                                v_mat_07));<br>
 <br>
                u  = _mm_and_si128(_mm_srai_epi16(u, 8), mask2);<br> <br>                v = _mm_add_epi16(<br>                        _mm_add_epi16(<br>                                _mm_mullo_epi16(r, v_mat_08),<br>                                _mm_mullo_epi16(g, v_mat_09)),<br>

                        _mm_add_epi16(<br>                                _mm_mullo_epi16(b, v_mat_10),<br>                                v_mat_11));<br> <br>                v = _mm_and_si128(_mm_srai_epi16(v, 8), mask2);<br>

 <br> <br>                res = _mm_or_si128(_mm_shuffle_epi8(y, mask_y1), _mm_shuffle_epi8(u, mask_u1));<br>                res = _mm_or_si128(res, _mm_shuffle_epi8(v, mask_v1));<br> <br>                _mm_storeu_si128((__m128i *)&p[i*32], res);<br>

 <br>                res = _mm_or_si128(_mm_shuffle_epi8(y, mask_y2), _mm_shuffle_epi8(u, mask_u2));<br>               res = _mm_or_si128(res, _mm_shuffle_epi8(v, mask_v2));<br> <br>                _mm_storeu_si128((__m128i *)&p[i*32 + 16], res);<br>

        }<br>}<br></div></div>
</blockquote></div><br></div></div></div></div>
</blockquote></div><br></div>