[igt-dev] [PATCH i-g-t 8/8] lib/igt_fb: Add support for NV12 format through conversion
Maarten Lankhorst
maarten.lankhorst at linux.intel.com
Wed Jan 31 15:09:08 UTC 2018
Op 31-01-18 om 15:32 schreef Ville Syrjälä:
> On Wed, Jan 31, 2018 at 03:45:37PM +0200, Mika Kahola wrote:
>> On Tue, 2018-01-23 at 13:56 +0100, Maarten Lankhorst wrote:
>>> For NV12 a format conversion is needed. Because YUV formats are not
>>> fully defined with just a fourcc, I've chosen BT.709 limited range.
>>> This puts the pixel center of the CbCr components between the top
>>> left Y and bottom left Y:
>>>
>>> Y Y Y Y
>>> UV UV
>>> Y Y Y Y
>>>
>>> Some work is put into optimising the conversion routines in order to
>>> make it fast enough. Before converting nv12 to rgb24, it is copied to
>>> a temporary buffer to take advantage of memory caching. This is
>>> approximately 20x faster than directly reading the BO.
>>>
>>> When testing on my KBL with a 1080p buffer, it takes approximately
>>> .1s to convert either way, this is fast enough not to bother
>>> optimising
>>> even further for me.
>>>
>>> Signed-off-by: Maarten Lankhorst <maarten.lankhorst at linux.intel.com>
>>> ---
>>> lib/igt_fb.c | 390 ++++++++++++++++++++++++++++++++++++++++++++++++-
>>> ----------
>>> 1 file changed, 318 insertions(+), 72 deletions(-)
>>>
>>> diff --git a/lib/igt_fb.c b/lib/igt_fb.c
>>> index 4d6b62a64ded..43b71e643586 100644
>>> --- a/lib/igt_fb.c
>>> +++ b/lib/igt_fb.c
>>> @@ -70,6 +70,7 @@ static struct format_desc_struct {
>>> DF(XRGB8888, RGB24, 32, 24),
>>> DF(XRGB2101010, RGB30, 32, 30),
>>> DF(ARGB8888, ARGB32, 32, 32),
>>> + DF(NV12, RGB24, 32, -1, 2, {8, 16}),
>>> };
>>> #undef DF
>>>
>>> @@ -363,14 +364,20 @@ static int create_bo_for_fb(int fd, int width,
>>> int height,
>>> *is_dumb = false;
>>>
>>> if (is_i915_device(fd)) {
>>> - uint32_t *ptr;
>>> + uint8_t *ptr;
>>>
>>> bo = gem_create(fd, size);
>>> gem_set_tiling(fd, bo,
>>> igt_fb_mod_to_tiling(tiling), stride);
>>>
>>> /* Ensure the framebuffer is preallocated */
>>> - ptr = gem_mmap__gtt(fd, bo, size,
>>> PROT_READ);
>>> - igt_assert(*ptr == 0);
>>> + ptr = gem_mmap__gtt(fd, bo, size, PROT_READ
>>> | PROT_WRITE);
>>> + igt_assert(*(uint32_t *)ptr == 0);
>>> +
>>> + if (format->drm_id == DRM_FORMAT_NV12) {
>>> + /* component formats have a
>>> different zero point */
>>> + memset(ptr, 16, offsets[1]);
>>> + memset(ptr + offsets[1], 0x80,
>>> (height + 1)/2 * stride);
>>> + }
>>> gem_munmap(ptr, size);
>>>
>>> if (size_ret)
>>> @@ -1126,103 +1133,117 @@ static cairo_format_t
>>> drm_format_to_cairo(uint32_t drm_format)
>>> drm_format, igt_format_str(drm_format));
>>> }
>>>
>>> +struct fb_blit_linear {
>>> + uint32_t handle;
>>> + unsigned size, stride;
>>> + uint8_t *map;
>>> + bool is_dumb;
>>> + uint32_t offsets[4];
>>> +};
>>> +
>>> struct fb_blit_upload {
>>> int fd;
>>> struct igt_fb *fb;
>>> - struct {
>>> - uint32_t handle;
>>> - unsigned size, stride;
>>> - uint8_t *map;
>>> - bool is_dumb;
>>> - } linear;
>>> + struct fb_blit_linear linear;
>>> };
>>>
>>> -static void destroy_cairo_surface__blit(void *arg)
>>> +static void free_linear_mapping(int fd, struct igt_fb *fb, struct
>>> fb_blit_linear *linear)
>>> {
>>> - struct fb_blit_upload *blit = arg;
>>> - struct igt_fb *fb = blit->fb;
>>> unsigned int obj_tiling = igt_fb_mod_to_tiling(fb->tiling);
>>> + int i;
>>>
>>> - gem_munmap(blit->linear.map, blit->linear.size);
>>> - fb->cairo_surface = NULL;
>>> + gem_munmap(linear->map, linear->size);
>>> + gem_set_domain(fd, linear->handle,
>>> + I915_GEM_DOMAIN_GTT, 0);
>>> +
>>> + for (i = 0; i < fb->num_planes; i++)
>>> + igt_blitter_fast_copy__raw(fd,
>>> + linear->handle,
>>> + linear->offsets[i],
>>> + linear->stride,
>>> + I915_TILING_NONE,
>>> + 0, 0, /* src_x, src_y */
>>> + fb->plane_width[i], fb-
>>>> plane_height[i],
>>> + fb->plane_bpp[i],
>>> + fb->gem_handle,
>>> + fb->offsets[i],
>>> + fb->stride,
>>> + obj_tiling,
>>> + 0, 0 /* dst_x, dst_y */);
>>> +
>>> + gem_sync(fd, linear->handle);
>>> + gem_close(fd, linear->handle);
>>> +}
>>>
>>> - gem_set_domain(blit->fd, blit->linear.handle,
>>> - I915_GEM_DOMAIN_GTT, 0);
>>> +static void destroy_cairo_surface__blit(void *arg)
>>> +{
>>> + struct fb_blit_upload *blit = arg;
>>>
>>> - igt_blitter_fast_copy__raw(blit->fd,
>>> - blit->linear.handle, 0,
>>> - blit->linear.stride,
>>> - I915_TILING_NONE,
>>> - 0, 0, /* src_x, src_y */
>>> - fb->width, fb->height,
>>> - igt_drm_format_to_bpp(fb-
>>>> drm_format),
>>> - fb->gem_handle,
>>> - fb->offsets[0],
>>> - fb->stride,
>>> - obj_tiling,
>>> - 0, 0 /* dst_x, dst_y */);
>>> -
>>> - gem_sync(blit->fd, blit->linear.handle);
>>> - gem_close(blit->fd, blit->linear.handle);
>>> + blit->fb->cairo_surface = NULL;
>>> +
>>> + free_linear_mapping(blit->fd, blit->fb, &blit->linear);
>>>
>>> free(blit);
>>> }
>>>
>>> -static void create_cairo_surface__blit(int fd, struct igt_fb *fb)
>>> +static void setup_linear_mapping(int fd, struct igt_fb *fb, struct
>>> fb_blit_linear *linear)
>>> {
>>> - struct fb_blit_upload *blit;
>>> - cairo_format_t cairo_format;
>>> unsigned int obj_tiling = igt_fb_mod_to_tiling(fb->tiling);
>>> - uint32_t offsets[4];
>>> -
>>> - blit = malloc(sizeof(*blit));
>>> - igt_assert(blit);
>>> + int i;
>>>
>>> /*
>>> * We create a linear BO that we'll map for the CPU to write
>>> to (using
>>> * cairo). This linear bo will be then blitted to its final
>>> * destination, tiling it at the same time.
>>> */
>>> - blit->linear.handle = create_bo_for_fb(fd, fb->width, fb-
>>>> height,
>>> + linear->handle = create_bo_for_fb(fd, fb->width, fb->height,
>>> lookup_drm_format(fb-
>>>> drm_format),
>>> LOCAL_DRM_FORMAT_MOD_
>>> NONE, 0,
>>> - 0, &blit-
>>>> linear.size,
>>> - &blit->linear.stride,
>>> - offsets, &blit-
>>>> linear.is_dumb);
>>> -
>>> - igt_assert(blit->linear.handle > 0);
>>> + 0, &linear->size,
>>> + &linear->stride,
>>> + linear->offsets,
>>> &linear->is_dumb);
>>>
>>> - blit->fd = fd;
>>> - blit->fb = fb;
>>> + igt_assert(linear->handle > 0);
>>>
>>> /* Copy fb content to linear BO */
>>> - gem_set_domain(fd, blit->linear.handle,
>>> + gem_set_domain(fd, linear->handle,
>>> I915_GEM_DOMAIN_GTT, 0);
>>>
>>> - igt_blitter_fast_copy__raw(fd,
>>> - fb->gem_handle,
>>> - fb->offsets[0],
>>> - fb->stride,
>>> - obj_tiling,
>>> - 0, 0, /* src_x, src_y */
>>> - fb->width, fb->height,
>>> - igt_drm_format_to_bpp(fb-
>>>> drm_format),
>>> - blit->linear.handle, 0,
>>> - blit->linear.stride,
>>> - I915_TILING_NONE,
>>> - 0, 0 /* dst_x, dst_y */);
>>> -
>>> - gem_sync(fd, blit->linear.handle);
>>> -
>>> - gem_set_domain(fd, blit->linear.handle,
>>> + for (i = 0; i < fb->num_planes; i++)
>>> + igt_blitter_fast_copy__raw(fd,
>>> + fb->gem_handle,
>>> + fb->offsets[i],
>>> + fb->stride,
>>> + obj_tiling,
>>> + 0, 0, /* src_x, src_y */
>>> + fb->plane_width[i], fb-
>>>> plane_height[i],
>>> + fb->plane_bpp[i],
>>> + linear->handle, linear-
>>>> offsets[i],
>>> + linear->stride,
>>> + I915_TILING_NONE,
>>> + 0, 0 /* dst_x, dst_y */);
>>> +
>>> + gem_sync(fd, linear->handle);
>>> +
>>> + gem_set_domain(fd, linear->handle,
>>> I915_GEM_DOMAIN_CPU, I915_GEM_DOMAIN_CPU);
>>>
>>> /* Setup cairo context */
>>> - blit->linear.map = gem_mmap__cpu(fd,
>>> - blit->linear.handle,
>>> - 0,
>>> - blit->linear.size,
>>> - PROT_READ | PROT_WRITE);
>>> + linear->map = gem_mmap__cpu(fd, linear->handle,
>>> + 0, linear->size, PROT_READ |
>>> PROT_WRITE);
>>> +}
>>> +
>>> +static void create_cairo_surface__blit(int fd, struct igt_fb *fb)
>>> +{
>>> + struct fb_blit_upload *blit;
>>> + cairo_format_t cairo_format;
>>> +
>>> + blit = malloc(sizeof(*blit));
>>> + igt_assert(blit);
>>> +
>>> + blit->fd = fd;
>>> + blit->fb = fb;
>>> + setup_linear_mapping(fd, fb, &blit->linear);
>>>
>>> cairo_format = drm_format_to_cairo(fb->drm_format);
>>> fb->cairo_surface =
>>> @@ -1284,6 +1305,232 @@ static void create_cairo_surface__gtt(int fd,
>>> struct igt_fb *fb)
>>> fb, destroy_cairo_surface__gtt);
>>> }
>>>
>>> +struct fb_convert_blit_upload {
>>> + int fd;
>>> + struct igt_fb *fb;
>>> +
>>> + struct {
>>> + uint8_t *map;
>>> + unsigned stride, size;
>>> + } rgb24;
>>> +
>>> + struct fb_blit_linear linear;
>>> +};
>>> +
>>> +static uint8_t clamprgb(float val) {
>>> + if (val < 0)
>>> + return 0;
>>> + if (val > 255)
>>> + return 255;
>>> +
>>> + return (uint8_t)val;
>> I'm thinking of this rounding issue. Should we use rounding to the
>> nearest integer here, instead of floor? Does the way we do rounding
>> have an effect on test results if we first grab the reference CRC and
>> then run a test in NV12 format?
>>
>>> +}
>>> +
>>> +static void convert_nv12_to_rgb24(struct igt_fb *fb, struct
>>> fb_convert_blit_upload *blit)
>>> +{
>>> + int i, j;
>>> + const uint8_t *y, *uv;
>>> + uint8_t *rgb24 = blit->rgb24.map;
>>> + unsigned rgb24_stride = blit->rgb24.stride, planar_stride =
>>> blit->linear.stride;
>>> + uint8_t *buf = malloc(blit->linear.size);
>>> +
>>> + /*
>>> + * Reading from the BO is awfully slow because of lack of
>>> read caching,
>>> + * it's faster to copy the whole BO to a temporary buffer
>>> and convert
>>> + * from there.
>>> + */
>>> + memcpy(buf, blit->linear.map, blit->linear.size);
>>> + y = &buf[blit->linear.offsets[0]];
>>> + uv = &buf[blit->linear.offsets[1]];
>>> +
>>> + for (i = 0; i < fb->height / 2; i++) {
>>> + for (j = 0; j < fb->width; j++) {
>>> + float r_, g_, b_, y0, y1, cb, cr;
>>> + /* Convert 1x2 pixel blocks */
>>> +
>>> + y0 = 1.164f * (y[j] - 16.f);
>>> + y1 = 1.164f * (y[j + planar_stride] - 16.f);
>>> +
>>> + cb = uv[j & ~1] - 128.f;
>>> + cr = uv[j | 1] - 128.f;
>>> +
>>> + r_ = 0.000f * cb + 1.793f * cr;
>>> + g_ = -0.213f * cb + -0.533f * cr;
>>> + b_ = 2.112f * cb + 0.000f * cr;
>>> +
>>> + rgb24[j * 4 + 2] = clamprgb(y0 + r_);
>>> + rgb24[j * 4 + 2 + rgb24_stride] =
>>> clamprgb(y1 + r_);
>>> +
>>> + rgb24[j * 4 + 1] = clamprgb(y0 + g_);
>>> + rgb24[j * 4 + 1 + rgb24_stride] =
>>> clamprgb(y1 + g_);
>>> +
>>> + rgb24[j * 4] = clamprgb(y0 + b_);
>>> + rgb24[j * 4 + rgb24_stride] = clamprgb(y1 +
>>> b_);
>>> + }
>>> +
>>> + rgb24 += 2 * rgb24_stride;
>>> + y += 2 * planar_stride;
>>> + uv += planar_stride;
>>> + }
>>> +
>>> + if (fb->height & 1) {
>>> + /* Convert last row */
>>> + for (j = 0; j < fb->width; j++) {
>>> + float r_, g_, b_, y0, cb, cr;
>>> + /* Convert single pixel */
>>> +
>>> + cb = uv[j & ~1] - 128.f;
>>> + cr = uv[j | 1] - 128.f;
>>> +
>>> + y0 = 1.164f * (y[j] - 16.f);
>>> + r_ = 0.000f * cb + 1.793f * cr;
>>> + g_ = -0.213f * cb + -0.533f * cr;
>>> + b_ = 2.112f * cb + 0.000f * cr;
>>> +
>> Do you have a reference for these conversion coefficients? According
>> to https://www.fourcc.org/fccyvrgb.php there are a few opinions how
>> this conversion should be handled?
> We probably don't want to hardcode these in the code since we'll want to
> test other encodings besides limited range BT.601. I think the cleanest
> approach is to derive the coefficients from the Kr/Kb values, but that
> might not lend itself well to tweaking the coefficients to match the
> hardware output (if that's necessary).
This is limited range BT.709. If we ever add support for encodings we can worry about it. :)
I think being 100% color accurate is not necessary for NV12, just as long as we are consistent.
>>> + rgb24[j * 4 + 2] = clamprgb(y0 + r_);
>>> + rgb24[j * 4 + 1] = clamprgb(y0 + g_);
>>> + rgb24[j * 4] = clamprgb(y0 + b_);
>>> + }
>>> + }
>>> +
>>> + free(buf);
>>> +}
>>> +
>>> +static void convert_rgb24_to_nv12(struct igt_fb *fb, struct
>>> fb_convert_blit_upload *blit)
>>> +{
>>> + int i, j;
>>> + uint8_t *y = &blit->linear.map[blit->linear.offsets[0]];
>>> + uint8_t *uv = &blit->linear.map[blit->linear.offsets[1]];
>>> + const uint8_t *rgb24 = blit->rgb24.map;
>>> + unsigned rgb24_stride = blit->rgb24.stride;
>>> + unsigned planar_stride = blit->linear.stride;
>>> +
>>> + igt_assert_f(fb->drm_format == DRM_FORMAT_NV12,
>>> + "Conversion not implemented for !NV12 planar
>>> formats\n");
>>> +
>>> + for (i = 0; i < fb->plane_height[0]; i++) {
>>> + /* Use limited color range BT.709 */
>>> +
>>> + for (j = 0; j < fb->plane_width[0]; j++) {
>>> + float yf = 0.183f * rgb24[j * 4 + 2] +
>>> + 0.614f * rgb24[j * 4 + 1] +
>>> + 0.062f * rgb24[j * 4] + 16;
>>> +
>>> + y[j] = (uint8_t)yf;
>> We could use clamprgb() here too. Just in case.
Since it's limited range, I don't think we'll hit it in practice..
>>> 0.183 * 255 + .614 * 255 + .062 * 255 + 16
235.045
>>> 0.183 * 0 + .614 * 0 + .062 * 0 + 16
16.0
u min/max:
>>> -.101 * 255 - .339 * 255 + 128
15.799999999999983
>>> .439 * 255 + 128
239.945
v min/max:
>>> -.339 * 255 + -.040 * 255 + 128
31.35499999999999
>>> .439 * 255 + 128
239.945
So it's pretty much contained between 15 and 240, no need to clamp..
>>> + }
>>> +
>>> + rgb24 += rgb24_stride;
>>> + y += planar_stride;
>>> + }
>>> +
>>> + rgb24 = blit->rgb24.map;
>>> +
>>> + for (i = 0; i < fb->height / 2; i++) {
>>> + for (j = 0; j < fb->plane_width[1]; j++) {
>>> + /*
>>> + * Pixel center for Cb'Cr' is between the
>>> left top and
>>> + * bottom pixel in a 2x2 block, so take the
>>> average.
>>> + */
>>> + float uf = -0.101f/2 * rgb24[j * 8 + 2] +
>>> + -0.101f/2 * rgb24[j * 8 + 2 +
>>> rgb24_stride] +
>>> + -0.339f/2 * rgb24[j * 8 + 1] +
>>> + -0.339f/2 * rgb24[j * 8 + 1 +
>>> rgb24_stride] +
>>> + 0.439f/2 * rgb24[j * 8] +
>>> + 0.439f/2 * rgb24[j * 8 +
>>> rgb24_stride] + 128;
>>> + float vf = 0.439f/2 * rgb24[j * 8 + 2] +
>>> + 0.439f/2 * rgb24[j * 8 + 2 +
>>> rgb24_stride] +
>>> + -0.339f/2 * rgb24[j * 8 + 1] +
>>> + -0.339f/2 * rgb24[j * 8 + 1 +
>>> rgb24_stride] +
>>> + -0.040f/2 * rgb24[j * 8] +
>>> + -0.040f/2 * rgb24[j * 8 +
>>> rgb24_stride] + 128;
>>> + uv[j * 2] = (uint8_t)uf;
>>> + uv[j * 2 + 1] = (uint8_t)vf;
>>>
>> Maybe add rounding & clamping here too?
>>>
>>> + }
>>> +
>>> + rgb24 += 2 * rgb24_stride;
>>> + uv += planar_stride;
>>> + }
>>> +
>>> + /* Last row cannot be interpolated between 2 pixels, take
>>> the single value */
>>> + if (i < fb->plane_height[1]) {
>>> + for (j = 0; j < fb->plane_width[1]; j++) {
>>> + float uf = -0.101f * rgb24[j * 8 + 2] +
>>> + -0.339f * rgb24[j * 8 + 1] +
>>> + 0.439f * rgb24[j * 8] + 128;
>>> + float vf = 0.439f * rgb24[j * 8 + 2] +
>>> + -0.339f * rgb24[j * 8 + 1] +
>>> + -0.040f * rgb24[j * 8] + 128;
>>> +
>>> + uv[j * 2] = (uint8_t)uf;
>>> + uv[j * 2 + 1] = (uint8_t)vf;
>> rounding & clamping?
I left out the clamping for Y and UV because we can overflow/undeflow by 16. If necessary I can re-add it. :)
~Maarten
More information about the igt-dev
mailing list