[igt-dev] [PATCH i-g-t 4/8] lib: Use igt_matrix for ycbcr<->rgb conversion

Wed Jun 6 10:11:41 UTC 2018

Op 23-05-18 om 20:31 schreef Ville Syrjala:
> From: Ville Syrjälä <ville.syrjala at linux.intel.com>
>
> Probably horribly inefficient (not that the original code tried to be
> particularly efficient), but at least this is now pretty generic so
> it'll be super easy to add other color encodings and whatnot.
Hey,

Conversion routines taken out (0s converting):
real    0m22,732s

Unpatched kms_nv12 run:
real    0m27,044s (4.3s converting)

Patched kms_nv12 run:
real    0m37,778s (15s converting)

It's not just making things worse, it's making it 3.5x slower. :/

> v2: Rebase
> v3: Deal with the new color_encoding/range enums
> v4: Fix the code to actually work, and do things in 2x2 blocks
>     Keep the chroma siting comment and pimp it up a bit
>
> Signed-off-by: Ville Syrjälä <ville.syrjala at linux.intel.com>
> ---
>  lib/igt_fb.c | 228 +++++++++++++++++++++++++++++++++++++++++------------------
>  1 file changed, 159 insertions(+), 69 deletions(-)
>
> diff --git a/lib/igt_fb.c b/lib/igt_fb.c
> index 9edee74b57c3..ceaaea70f64b 100644
> --- a/lib/igt_fb.c
> +++ b/lib/igt_fb.c
> @@ -31,8 +31,10 @@
>  
>  #include "drmtest.h"
>  #include "igt_aux.h"
> +#include "igt_color_encoding.h"
>  #include "igt_fb.h"
>  #include "igt_kms.h"
> +#include "igt_matrix.h"
>  #include "igt_x86.h"
>  #include "ioctl_wrappers.h"
>  #include "intel_batchbuffer.h"
> @@ -1356,6 +1358,21 @@ static uint8_t clamprgb(float val)
>  	return clamp(val, 0.0f, 255.0f);
>  }
>  
> +static void read_rgb(struct igt_vec4 *rgb, const uint8_t *rgb24)
> +{
> +	rgb->d[0] = rgb24[2];
> +	rgb->d[1] = rgb24[1];
> +	rgb->d[2] = rgb24[0];
> +	rgb->d[3] = 1.0f;
> +}
> +
> +static void write_rgb(uint8_t *rgb24, const struct igt_vec4 *rgb)
> +{
> +	rgb24[2] = clamprgb(rgb->d[0]);
> +	rgb24[1] = clamprgb(rgb->d[1]);
> +	rgb24[0] = clamprgb(rgb->d[2]);
> +}
> +
>  static void convert_nv12_to_rgb24(struct igt_fb *fb, struct fb_convert_blit_upload *blit)
>  {
>  	int i, j;
> @@ -1363,6 +1380,8 @@ static void convert_nv12_to_rgb24(struct igt_fb *fb, struct fb_convert_blit_uplo
>  	uint8_t *rgb24 = blit->rgb24.map;
>  	unsigned rgb24_stride = blit->rgb24.stride, planar_stride = blit->linear.stride;
>  	uint8_t *buf = malloc(blit->linear.size);
> +	struct igt_mat4 m = igt_ycbcr_to_rgb_matrix(IGT_COLOR_YCBCR_BT601,
> +						    IGT_COLOR_YCBCR_LIMITED_RANGE);
>  
>  	/*
>  	 * Reading from the BO is awfully slow because of lack of read caching,
> @@ -1373,30 +1392,49 @@ static void convert_nv12_to_rgb24(struct igt_fb *fb, struct fb_convert_blit_uplo
>  	y = &buf[blit->linear.offsets[0]];
>  	uv = &buf[blit->linear.offsets[1]];
>  
> -	/* Convert from limited color range BT.601 */
>  	for (i = 0; i < fb->height / 2; i++) {
> -		for (j = 0; j < fb->width; j++) {
> -			float r_, g_, b_, y0, y1, cb, cr;
> -			/* Convert 1x2 pixel blocks */
> -
> -			y0 = 1.164f * (y[j] - 16.f);
> -			y1 = 1.164f * (y[j + planar_stride] - 16.f);
> +		for (j = 0; j < fb->width / 2; j++) {
> +			/* Convert 2x2 pixel blocks */
> +			struct igt_vec4 yuv[4];
> +			struct igt_vec4 rgb[4];
> +
> +			yuv[0].d[0] = y[j * 2 + 0];
> +			yuv[1].d[0] = y[j * 2 + 1];
> +			yuv[2].d[0] = y[j * 2 + 0 + planar_stride];
> +			yuv[3].d[0] = y[j * 2 + 1 + planar_stride];
> +
> +			yuv[0].d[1] = yuv[1].d[1] = yuv[2].d[1] = yuv[3].d[1] = uv[j * 2 + 0];
> +			yuv[0].d[2] = yuv[1].d[2] = yuv[2].d[2] = yuv[3].d[2] = uv[j * 2 + 1];
> +			yuv[0].d[3] = yuv[1].d[3] = yuv[2].d[3] = yuv[3].d[3] = 1.0f;
> +
> +			rgb[0] = igt_matrix_transform(&m, &yuv[0]);
> +			rgb[1] = igt_matrix_transform(&m, &yuv[1]);
> +			rgb[2] = igt_matrix_transform(&m, &yuv[2]);
> +			rgb[3] = igt_matrix_transform(&m, &yuv[3]);
> +
> +			write_rgb(&rgb24[j * 8 + 0], &rgb[0]);
> +			write_rgb(&rgb24[j * 8 + 4], &rgb[1]);
> +			write_rgb(&rgb24[j * 8 + 0 + rgb24_stride], &rgb[2]);
> +			write_rgb(&rgb24[j * 8 + 4 + rgb24_stride], &rgb[3]);
> +		}
>  
> -			cb = uv[j & ~1] - 128.f;
> -			cr = uv[j | 1] - 128.f;
> +		if (fb->width & 1) {
> +			/* Convert 1x2 pixel block */
> +			struct igt_vec4 yuv[2];
> +			struct igt_vec4 rgb[2];
>  
> -			r_ =  0.000f * cb +  1.596f * cr;
> -			g_ = -0.392f * cb + -0.813f * cr;
> -			b_ =  2.017f * cb +  0.000f * cr;
> +			yuv[0].d[0] = y[j * 2 + 0];
> +			yuv[1].d[0] = y[j * 2 + 0 + planar_stride];
>  
> -			rgb24[j * 4 + 2] = clamprgb(y0 + r_);
> -			rgb24[j * 4 + 2 + rgb24_stride] = clamprgb(y1 + r_);
> +			yuv[0].d[1] = yuv[1].d[1] = uv[j * 2 + 0];
> +			yuv[0].d[2] = yuv[1].d[2] = uv[j * 2 + 1];
> +			yuv[0].d[3] = yuv[1].d[3] = 1.0f;
>  
> -			rgb24[j * 4 + 1] = clamprgb(y0 + g_);
> -			rgb24[j * 4 + 1 + rgb24_stride] = clamprgb(y1 + g_);
> +			rgb[0] = igt_matrix_transform(&m, &yuv[0]);
> +			rgb[1] = igt_matrix_transform(&m, &yuv[1]);
>  
> -			rgb24[j * 4] = clamprgb(y0 + b_);
> -			rgb24[j * 4 + rgb24_stride] = clamprgb(y1 + b_);
> +			write_rgb(&rgb24[j * 8 + 0], &rgb[0]);
> +			write_rgb(&rgb24[j * 8 + 0 + rgb24_stride], &rgb[1]);
>  		}
>  
>  		rgb24 += 2 * rgb24_stride;
> @@ -1406,21 +1444,37 @@ static void convert_nv12_to_rgb24(struct igt_fb *fb, struct fb_convert_blit_uplo
>  
>  	if (fb->height & 1) {
>  		/* Convert last row */
> -		for (j = 0; j < fb->width; j++) {
> -			float r_, g_, b_, y0, cb, cr;
> +		for (j = 0; j < fb->width / 2; j++) {
> +			/* Convert 2x1 pixel blocks */
> +			struct igt_vec4 yuv[2];
> +			struct igt_vec4 rgb[2];
> +
> +			yuv[0].d[0] = y[j * 2 + 0];
> +			yuv[1].d[0] = y[j * 2 + 1];
> +			yuv[0].d[1] = yuv[1].d[1] = uv[j * 2 + 0];
> +			yuv[0].d[2] = yuv[1].d[2] = uv[j * 2 + 1];
> +			yuv[0].d[3] = yuv[1].d[3] = 1.0f;
> +
> +			rgb[0] = igt_matrix_transform(&m, &yuv[0]);
> +			rgb[1] = igt_matrix_transform(&m, &yuv[1]);
> +
> +			write_rgb(&rgb24[j * 8 + 0], &rgb[0]);
> +			write_rgb(&rgb24[j * 8 + 4], &rgb[0]);
> +		}
> +
> +		if (fb->width & 1) {
>  			/* Convert single pixel */
> +			struct igt_vec4 yuv;
> +			struct igt_vec4 rgb;
>  
> -			cb = uv[j & ~1] - 128.f;
> -			cr = uv[j | 1] - 128.f;
> +			yuv.d[0] = y[j * 2 + 0];
> +			yuv.d[1] = uv[j * 2 + 0];
> +			yuv.d[2] = uv[j * 2 + 1];
> +			yuv.d[3] = 1.0f;
>  
> -			y0 = 1.164f * (y[j] - 16.f);
> -			r_ =  0.000f * cb +  1.596f * cr;
> -			g_ = -0.392f * cb + -0.813f * cr;
> -			b_ =  2.017f * cb +  0.000f * cr;
> +			rgb = igt_matrix_transform(&m, &yuv);
>  
> -			rgb24[j * 4 + 2] = clamprgb(y0 + r_);
> -			rgb24[j * 4 + 1] = clamprgb(y0 + g_);
> -			rgb24[j * 4] = clamprgb(y0 + b_);
> +			write_rgb(&rgb24[j * 8 + 0], &rgb);
>  		}
>  	}
>  
> @@ -1435,65 +1489,101 @@ static void convert_rgb24_to_nv12(struct igt_fb *fb, struct fb_convert_blit_uplo
>  	const uint8_t *rgb24 = blit->rgb24.map;
>  	unsigned rgb24_stride = blit->rgb24.stride;
>  	unsigned planar_stride = blit->linear.stride;
> +	struct igt_mat4 m = igt_rgb_to_ycbcr_matrix(IGT_COLOR_YCBCR_BT601,
> +						    IGT_COLOR_YCBCR_LIMITED_RANGE);
>  
>  	igt_assert_f(fb->drm_format == DRM_FORMAT_NV12,
>  		     "Conversion not implemented for !NV12 planar formats\n");
>  
> -	for (i = 0; i < fb->plane_height[0]; i++) {
> -		/* Use limited color range BT.601 */
> -
> -		for (j = 0; j < fb->plane_width[0]; j++) {
> -			float yf = 0.257f * rgb24[j * 4 + 2] +
> -				   0.504f * rgb24[j * 4 + 1] +
> -				   0.098f * rgb24[j * 4] + 16;
> +	for (i = 0; i < fb->height / 2; i++) {
> +		for (j = 0; j < fb->width / 2; j++) {
> +			/* Convert 2x2 pixel blocks */
> +			struct igt_vec4 rgb[4];
> +			struct igt_vec4 yuv[4];
> +
> +			read_rgb(&rgb[0], &rgb24[j * 8 + 0]);
> +			read_rgb(&rgb[1], &rgb24[j * 8 + 4]);
> +			read_rgb(&rgb[2], &rgb24[j * 8 + 0 + rgb24_stride]);
> +			read_rgb(&rgb[3], &rgb24[j * 8 + 4 + rgb24_stride]);
> +
> +			yuv[0] = igt_matrix_transform(&m, &rgb[0]);
> +			yuv[1] = igt_matrix_transform(&m, &rgb[1]);
> +			yuv[2] = igt_matrix_transform(&m, &rgb[2]);
> +			yuv[3] = igt_matrix_transform(&m, &rgb[3]);
> +
> +			y[j * 2 + 0] = yuv[0].d[0];
> +			y[j * 2 + 1] = yuv[1].d[0];
> +			y[j * 2 + 0 + planar_stride] = yuv[2].d[0];
> +			y[j * 2 + 1 + planar_stride] = yuv[3].d[0];
>  
> -			y[j] = (uint8_t)yf;
> +			/*
> +			 * We assume the MPEG2 chroma siting convention, where
> +			 * pixel center for Cb'Cr' is between the left top and
> +			 * bottom pixel in a 2x2 block, so take the average.
> +			 */
> +			uv[j * 2 + 0] = (yuv[0].d[1] + yuv[2].d[1]) / 2.0f;
> +			uv[j * 2 + 1] = (yuv[0].d[2] + yuv[2].d[2]) / 2.0f;
>  		}
>  
> -		rgb24 += rgb24_stride;
> -		y += planar_stride;
> -	}
> +		if (fb->width & 1) {
> +			/* Convert 1x2 pixel block */
> +			struct igt_vec4 rgb[2];
> +			struct igt_vec4 yuv[2];
>  
> -	rgb24 = blit->rgb24.map;
> +			read_rgb(&rgb[0], &rgb24[j * 8 + 0]);
> +			read_rgb(&rgb[2], &rgb24[j * 8 + 0 + rgb24_stride]);
> +
> +			yuv[0] = igt_matrix_transform(&m, &rgb[0]);
> +			yuv[1] = igt_matrix_transform(&m, &rgb[1]);
> +
> +			y[j * 2 + 0] = yuv[0].d[0];
> +			y[j * 2 + 0 + planar_stride] = yuv[1].d[0];
>  
> -	for (i = 0; i < fb->height / 2; i++) {
> -		for (j = 0; j < fb->plane_width[1]; j++) {
>  			/*
> -			 * Pixel center for Cb'Cr' is between the left top and
> +			 * We assume the MPEG2 chroma siting convention, where
> +			 * pixel center for Cb'Cr' is between the left top and
>  			 * bottom pixel in a 2x2 block, so take the average.
>  			 */
> -			float uf = -0.148f/2 * rgb24[j * 8 + 2] +
> -				   -0.148f/2 * rgb24[j * 8 + 2 + rgb24_stride] +
> -				   -0.291f/2 * rgb24[j * 8 + 1] +
> -				   -0.291f/2 * rgb24[j * 8 + 1 + rgb24_stride] +
> -				    0.439f/2 * rgb24[j * 8] +
> -				    0.439f/2 * rgb24[j * 8 + rgb24_stride] + 128;
> -			float vf =  0.439f/2 * rgb24[j * 8 + 2] +
> -				    0.439f/2 * rgb24[j * 8 + 2 + rgb24_stride] +
> -				   -0.368f/2 * rgb24[j * 8 + 1] +
> -				   -0.368f/2 * rgb24[j * 8 + 1 + rgb24_stride] +
> -				   -0.071f/2 * rgb24[j * 8] +
> -				   -0.071f/2 * rgb24[j * 8 + rgb24_stride] + 128;
> -			uv[j * 2] = (uint8_t)uf;
> -			uv[j * 2 + 1] = (uint8_t)vf;
> +			uv[j * 2 + 0] = (yuv[0].d[1] + yuv[1].d[1]) / 2.0f;
> +			uv[j * 2 + 1] = (yuv[0].d[2] + yuv[1].d[2]) / 2.0f;
>  		}
>  
>  		rgb24 += 2 * rgb24_stride;
> +		y += 2 * planar_stride;
>  		uv += planar_stride;
>  	}
>  
>  	/* Last row cannot be interpolated between 2 pixels, take the single value */
> -	if (i < fb->plane_height[1]) {
> -		for (j = 0; j < fb->plane_width[1]; j++) {
> -			float uf = -0.148f * rgb24[j * 8 + 2] +
> -				   -0.291f * rgb24[j * 8 + 1] +
> -				    0.439f * rgb24[j * 8] + 128;
> -			float vf =  0.439f * rgb24[j * 8 + 2] +
> -				   -0.368f * rgb24[j * 8 + 1] +
> -				   -0.071f * rgb24[j * 8] + 128;
> -
> -			uv[j * 2] = (uint8_t)uf;
> -			uv[j * 2 + 1] = (uint8_t)vf;
> +	if (fb->height & 1) {
> +		for (j = 0; j < fb->width / 2; j++) {
> +			/* Convert 2x1 pixel blocks */
> +			struct igt_vec4 rgb[2];
> +			struct igt_vec4 yuv[2];
> +
> +			read_rgb(&rgb[0], &rgb24[j * 8 + 0]);
> +			read_rgb(&rgb[1], &rgb24[j * 8 + 4]);
> +
> +			yuv[0] = igt_matrix_transform(&m, &rgb[0]);
> +			yuv[1] = igt_matrix_transform(&m, &rgb[1]);
> +
> +			y[j * 2 + 0] = yuv[0].d[0];
> +			y[j * 2 + 1] = yuv[1].d[0];
> +			uv[j * 2 + 0] = yuv[0].d[1];
> +			uv[j * 2 + 1] = yuv[0].d[2];
> +		}
> +
> +		if (fb->width & 1) {
> +			/* Convert single pixel */
> +			struct igt_vec4 rgb;
> +			struct igt_vec4 yuv;
> +
> +			read_rgb(&rgb, &rgb24[j * 8 + 0]);
> +
> +			yuv = igt_matrix_transform(&m, &rgb);
> +
> +			y[j * 2 + 0] = yuv.d[0];
> +			uv[j * 2 + 0] = yuv.d[1];
> +			uv[j * 2 + 1] = yuv.d[2];
>  		}
>  	}
>  }

We could get rid of the height & 1 and width & 1 checks, we only allow an alignment with a multiple of 4 in i915, and I think restricting it to 2x2 in igt won't lose us much...

One thing we should definitely do is making igt_matrix_transform inline, and enable vectorization somehow. The current code is horribly unoptimized. :(