Videoconvert needs to be optimized

Sun Jun 22 22:49:11 PDT 2014

Another, multi-platform version uses lookup tables instead of multiplications
and packs all components into one 64-bit number:

/* Lookup-table state shared by every caller of
 * videoconvert_convert_matrix8().  op is 0 until the tables have been built
 * and 2 afterwards.  Nothing here is synchronised, so concurrent callers that
 * use different matrices would race on these tables. */
static int op=0;
static int64_t t_r3[256], t_g3[256], t_b3[256];
/* The 3x3 coefficients the tables were last built from.  Comparing against
 * them lets a changed matrix trigger a rebuild instead of silently reusing
 * stale tables. */
static int t_k3[3][3];

/* Return non-zero when the tables are missing or were built from coefficients
 * that differ from convert->cmatrix[0..2][0..2]. */
static int
videoconvert_matrix8_tables_stale (VideoConvert * convert)
{
  int row, col;

  if (op < 2)
    return 1;

  for (row = 0; row < 3; row++) {
    for (col = 0; col < 3; col++) {
      if (t_k3[row][col] != convert->cmatrix[row][col])
        return 1;
    }
  }
  return 0;
}

/* (Re)build the three 256-entry tables from convert->cmatrix.
 *
 * For an input byte i, each entry packs the three per-row products
 *   cmatrix[0][n] * i * 2^32 + cmatrix[1][n] * i * 2^16 + cmatrix[2][n] * i
 * (n = 0 for t_r3, 1 for t_g3, 2 for t_b3) into one 64-bit value, so a single
 * 64-bit addition later accumulates all three output components at once. */
static void
videoconvert_matrix8_build_tables (VideoConvert * convert)
{
  int row, col, i;

  for (row = 0; row < 3; row++) {
    for (col = 0; col < 3; col++)
      t_k3[row][col] = convert->cmatrix[row][col];
  }

  for (i = 0; i <= 255; i++) {
    int64_t r = 0, g = 0, b = 0;

    for (row = 0; row <= 2; row++) {
      /* Multiply by 65536 rather than shifting left by 16: the partial sums
       * are negative whenever a coefficient is negative, and left-shifting a
       * negative value is undefined behaviour in C. */
      r = r * 65536 + (int64_t) t_k3[row][0] * i;
      g = g * 65536 + (int64_t) t_k3[row][1] * i;
      b = b * 65536 + (int64_t) t_k3[row][2] * i;
    }
    t_r3[i] = r;
    t_g3[i] = g;
    t_b3[i] = b;
  }

  /* Mark the tables valid only once every entry has been written. */
  op = 2;
}

/* Apply the colour matrix in convert->cmatrix in place to one line of
 * convert->width 4-byte pixels starting at p.
 *
 * Byte 0 of every pixel is left untouched; bytes 1, 2 and 3 are the inputs
 * and are overwritten with the results of matrix rows 0, 1 and 2 (each row's
 * sum shifted right by 8).
 *
 * Limitations of the packed representation:
 *  - the three sums sit only 16 bits apart, so a row sum that is negative or
 *    >= 65536 borrows from / carries into the neighbouring component;
 *  - no clamping is performed: only the low byte of each shifted value is
 *    stored.
 */
static void
videoconvert_convert_matrix8 (VideoConvert * convert, gpointer p)
{
  guint8 *pixels = p;
  int64_t c;
  int t, n_bytes;

  if (videoconvert_matrix8_tables_stale (convert))
    videoconvert_matrix8_build_tables (convert);

  /* Constant (offset) column of the matrix, packed like the table entries.
   * Multiplication is used instead of << for the same negative-value reason
   * as in the table builder. */
  c = (int64_t) convert->cmatrix[0][3] * 4294967296LL
      + (int64_t) convert->cmatrix[1][3] * 65536
      + (int64_t) convert->cmatrix[2][3];

  n_bytes = convert->width * 4;
  for (t = 0; t < n_bytes; t += 4) {
    int64_t x3 = t_r3[pixels[t + 1]] + t_g3[pixels[t + 2]]
        + t_b3[pixels[t + 3]] + c;

    /* Field offsets 32/16/0 plus the 8-bit scale give shifts of 40/24/8. */
    pixels[t + 1] = x3 >> 40;
    pixels[t + 2] = x3 >> 24;
    pixels[t + 3] = x3 >> 8;
  }
}

and the results on an i5-3330 at 3 GHz:

x86: stock vs. table64

Performance counter stats for process id '2427':

      24441.388683 task-clock                #    0.555 CPUs utilized

            16,117 context-switches          #    0.659 K/sec

               219 cpu-migrations            #    0.009 K/sec

             5,233 page-faults               #    0.214 K/sec

    77,409,381,175 cycles                    #    3.167 GHz

    26,292,718,337 stalled-cycles-frontend   #   33.97% frontend cycles
idle
   <not supported> stalled-cycles-backend
   197,607,036,204 instructions              #    2.55  insns per cycle

                                             #    0.13  stalled cycles per
insn
    12,292,242,916 branches                  #  502.927 M/sec

        21,693,862 branch-misses             #    0.18% of all branches

   <not supported> L1-dcache-loads:HG
       481,143,745 L1-dcache-load-misses:HG  #    0.00% of all L1-dcache
hits
       239,777,528 LLC-loads:HG              #    9.810 M/sec

   <not supported> LLC-load-misses:HG

      44.023759316 seconds time elapsed

Performance counter stats for process id '2822':

      16221.909026 task-clock                #    0.339 CPUs utilized

            15,932 context-switches          #    0.982 K/sec

             1,189 cpu-migrations            #    0.073 K/sec

             3,315 page-faults               #    0.204 K/sec

    51,320,206,381 cycles                    #    3.164 GHz

    13,246,360,560 stalled-cycles-frontend   #   25.81% frontend cycles
idle
   <not supported> stalled-cycles-backend
   127,814,464,535 instructions              #    2.49  insns per cycle

                                             #    0.10  stalled cycles per
insn
     5,902,096,929 branches                  #  363.835 M/sec

        21,703,222 branch-misses             #    0.37% of all branches

   <not supported> L1-dcache-loads:HG
       608,012,138 L1-dcache-load-misses:HG  #    0.00% of all L1-dcache
hits
       250,741,676 LLC-loads:HG              #   15.457 M/sec

   <not supported> LLC-load-misses:HG

      47.831511910 seconds time elapsed

x86_64: stock vs. table64

Performance counter stats for process id '3506':

      23258.800974 task-clock                #    0.455 CPUs utilized

            18,130 context-switches          #    0.779 K/sec

             2,288 cpu-migrations            #    0.098 K/sec

             1,331 page-faults               #    0.057 K/sec

    73,692,193,376 cycles                    #    3.168 GHz

    21,218,974,690 stalled-cycles-frontend   #   28.79% frontend cycles
idle
   <not supported> stalled-cycles-backend
   198,710,985,363 instructions              #    2.70  insns per cycle

                                             #    0.11  stalled cycles per
insn
    14,252,859,241 branches                  #  612.794 M/sec

        22,748,214 branch-misses             #    0.16% of all branches

   <not supported> L1-dcache-loads:HG
       566,065,245 L1-dcache-load-misses:HG  #    0.00% of all L1-dcache
hits
       267,280,364 LLC-loads:HG              #   11.492 M/sec

   <not supported> LLC-load-misses:HG

      51.148678220 seconds time elapsed

Performance counter stats for process id '3379':

      12461.157685 task-clock                #    0.250 CPUs utilized
             5,485 context-switches          #    0.001 M/sec
               238 cpu-migrations            #    0.019 K/sec

             8,634 page-faults               #    0.693 K/sec

    39,284,950,355 cycles                    #    3.153 GHz

     8,891,423,340 stalled-cycles-frontend   #   22.63% frontend cycles
idle

   <not supported> stalled-cycles-backend

   105,668,751,265 instructions              #    2.69  insns per cycle

                                             #    0.08  stalled cycles per
insn
     6,123,961,551 branches                  #  491.444 M/sec

        20,472,272 branch-misses             #    0.33% of all branches

   <not supported> L1-dcache-loads:HG
       608,576,686 L1-dcache-load-misses:HG  #    0.00% of all L1-dcache
hits
       259,895,252 LLC-loads:HG              #   20.856 M/sec
   <not supported> LLC-load-misses:HG

      49.792876350 seconds time elapsed

2014-06-21 20:30 GMT+00:00 Yaroslav Andrusyak <pontostroy at gmail.com>:

> One good man gave me an SSE3 version of videoconvert_convert_matrix8 (4-7
> times faster)
>
> Samples: 198K of event 'cycles', Event count (approx.): 112390215455
>
>
>  31.71%  libx264.so.142               [.]
> x264_add8x8_idct_avx2.skip_prologue
>  19.26%  libx264.so.142               [.] 0x00000000000951bc
>  10.58%  libx264.so.142               [.]
> x264_add8x8_idct_avx.skip_prologue
>   3.71%  libgstvideoconvert.so        [.] videoconvert_convert_matrix8
>     (3.71% vs  23.51%)
>   2.83%  libx264.so.142               [.] x264_me_search_ref
>   2.82%  orcexec.eWcXD2               [.] 0x0000000000000284
>   2.64%  libc-2.18.so                 [.] __memcpy_sse2_unaligned
>   2.24%  libgstvideo-1.0.so.0.204.0   [.] video_chroma_down_v2_guint8
>   2.22%  libgstvideo-1.0.so.0.204.0   [.] video_chroma_down_h2_guint8
>   2.20%  libx264.so.142               [.]
> x264_macroblock_cache_load_progressive
>   1.95%  libx264.so.142               [.] x264_sub8x8_dct_avx.skip_prologue
>   1.55%  libx264.so.142               [.] x264_macroblock_analyse
>   0.97%  libx264.so.142               [.] x264_macroblock_encode
>   0.94%  libx264.so.142               [.] x264_macroblock_cache_save
>   0.90%  libx264.so.142               [.] x264_mb_predict_mv_direct16x16
>   0.83%  libx264.so.142               [.] x264_mb_predict_mv_ref16x16
>   0.62%  perf                         [.] 0x0000000000067844
>   0.59%  libx264.so.142               [.] x264_mb_encode_chroma
>   0.39%  libx264.so.142               [.] x264_macroblock_probe_skip
>
>
> Is it possible to add videoconvert_convert_matrix8_sse upstream?
>
>
> static void
> videoconvert_convert_matrix8 (VideoConvert * convert, gpointer pixels)
> {
>         int i,j;
>         guint8 *p = pixels;
>
>         __m128i v_byte1 = _mm_set1_epi32(0x000000ff);
>         __m128i v_byte3 = _mm_set1_epi32(0x00ff0000);
>         __m128i v_mat_00 = _mm_set1_epi16((short
> int)convert->cmatrix[0][0]);
>         __m128i v_mat_01 = _mm_set1_epi16((short
> int)convert->cmatrix[0][1]);
>         __m128i v_mat_02 = _mm_set1_epi16((short
> int)convert->cmatrix[0][2]);
>         __m128i v_mat_03 = _mm_set1_epi16((short
> int)convert->cmatrix[0][3]);
>         __m128i v_mat_04 = _mm_set1_epi16((short
> int)convert->cmatrix[1][0]);
>         __m128i v_mat_05 = _mm_set1_epi16((short
> int)convert->cmatrix[1][1]);
>         __m128i v_mat_06 = _mm_set1_epi16((short
> int)convert->cmatrix[1][2]);
>         __m128i v_mat_07 = _mm_set1_epi16((short
> int)convert->cmatrix[1][3]);
>         __m128i v_mat_08 = _mm_set1_epi16((short
> int)convert->cmatrix[2][0]);
>         __m128i v_mat_09 = _mm_set1_epi16((short
> int)convert->cmatrix[2][1]);
>         __m128i v_mat_10 = _mm_set1_epi16((short
> int)convert->cmatrix[2][2]);
>         __m128i v_mat_11 = _mm_set1_epi16((short
> int)convert->cmatrix[2][3]);
>
>         __m128i mask2   = _mm_set1_epi32(0x00ff00ff);
>
>         __m128i mask_y1 = _mm_set_epi8((char)128, (char)128, 12,
> (char)128,   (char)128, (char)128, 8, (char)128,
>                                         (char)128, (char)128, 4,
> (char)128,   (char)128, (char)128, 0, (char)128);
>
>         __m128i mask_y2 = _mm_set_epi8((char)128, (char)128, 14,
>  (char)128,  (char)128, (char)128, 10, (char)128,
>                                         (char)128, (char)128, 6,
> (char)128,   (char)128, (char)128, 2, (char)128);
>
>         __m128i mask_u1 = _mm_set_epi8((char)128, 12, (char)128,
> (char)128,   (char)128, 8, (char)128, (char)128,
>                                         (char)128, 4, (char)128,
> (char)128,   (char)128, 0, (char)128, (char)128);
>
>         __m128i mask_u2 = _mm_set_epi8((char)128, 14, (char)128,
> (char)128,   (char)128, 10, (char)128, (char)128,
>                                         (char)128, 6, (char)128,
> (char)128,   (char)128, 2, (char)128, (char)128);
>
>         __m128i mask_v1 = _mm_set_epi8(12, (char)128, (char)128,
> (char)128,   8, (char)128, (char)128, (char)128,
>                                         4, (char)128, (char)128,
> (char)128,   0, (char)128, (char)128, (char)128);
>
>         __m128i mask_v2 = _mm_set_epi8(14, (char)128, (char)128,
> (char)128,   10, (char)128, (char)128, (char)128,
>                                         6, (char)128, (char)128,
> (char)128,   2, (char)128, (char)128, (char)128);
>
>
>         for (i=0; i<convert->width / 8; i++) {
>                 __m128i a1, a2, r, g, b, y, u, v, res;
>
>                 a1 = _mm_loadu_si128((__m128i *)&p[i*32]);
>                 a2 = _mm_loadu_si128((__m128i *)&p[i*32 + 16]);
>
>                 r = _mm_or_si128(_mm_and_si128(_mm_srli_si128(a1, 1),
> v_byte1), _mm_and_si128(_mm_slli_si128(a2, 1), v_byte3));
>                 g = _mm_or_si128(_mm_and_si128(_mm_srli_si128(a1, 2),
> v_byte1), _mm_and_si128(a2, v_byte3));
>                 b = _mm_or_si128(_mm_and_si128(_mm_srli_si128(a1, 3),
> v_byte1), _mm_and_si128(_mm_srli_si128(a2, 1), v_byte3));
>
>
>                 y = _mm_add_epi16(
>                         _mm_add_epi16(
>                                 _mm_mullo_epi16(r, v_mat_00),
>                                 _mm_mullo_epi16(g, v_mat_01)),
>                         _mm_add_epi16(
>                                 _mm_mullo_epi16(b, v_mat_02),
>                                 v_mat_03));
>
>                 y = _mm_and_si128(_mm_srai_epi16(y, 8), mask2);
>
>                 u = _mm_add_epi16(
>                         _mm_add_epi16(
>                                 _mm_mullo_epi16(r, v_mat_04),
>                                 _mm_mullo_epi16(g, v_mat_05)),
>                         _mm_add_epi16(
>                                 _mm_mullo_epi16(b, v_mat_06),
>                                 v_mat_07));
>
>                 u  = _mm_and_si128(_mm_srai_epi16(u, 8), mask2);
>
>                 v = _mm_add_epi16(
>                         _mm_add_epi16(
>                                 _mm_mullo_epi16(r, v_mat_08),
>                                 _mm_mullo_epi16(g, v_mat_09)),
>                         _mm_add_epi16(
>                                 _mm_mullo_epi16(b, v_mat_10),
>                                 v_mat_11));
>
>                 v = _mm_and_si128(_mm_srai_epi16(v, 8), mask2);
>
>
>                 res = _mm_or_si128(_mm_shuffle_epi8(y, mask_y1),
> _mm_shuffle_epi8(u, mask_u1));
>                 res = _mm_or_si128(res, _mm_shuffle_epi8(v, mask_v1));
>
>                 _mm_storeu_si128((__m128i *)&p[i*32], res);
>
>                 res = _mm_or_si128(_mm_shuffle_epi8(y, mask_y2),
> _mm_shuffle_epi8(u, mask_u2));
>                res = _mm_or_si128(res, _mm_shuffle_epi8(v, mask_v2));
>
>                 _mm_storeu_si128((__m128i *)&p[i*32 + 16], res);
>         }
> }
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.freedesktop.org/archives/gstreamer-devel/attachments/20140623/2e4f8275/attachment-0001.html>