[PATCH 2/4] drm/ast: cursor: Move format conversion to shared helper

Thomas Zimmermann tzimmermann at suse.de
Fri Feb 14 09:36:57 UTC 2025


Hi,

here's some additional information on this patch.

Am 13.02.25 um 17:25 schrieb Thomas Zimmermann:
[...]
> +
> +		drm_fb_argb8888_to_argb4444(argb4444_dst, argb4444_dst_pitch,
> +					    shadow_plane_state->data, fb, &damage,
> +					    &shadow_plane_state->fmtcnv_state);
[...]
>   
> +static void drm_fb_argb8888_to_argb4444_line(void *dbuf, const void *sbuf, unsigned int pixels)
> +{
> +	unsigned int pixels2 = pixels & ~GENMASK_ULL(0, 0);
> +	__le32 *dbuf32 = dbuf;
> +	__le16 *dbuf16 = dbuf + pixels2 * sizeof(*dbuf16);
> +	const __le32 *sbuf32 = sbuf;
> +	unsigned int x;
> +	u32 val32;
> +	u16 val16;
> +	u32 pix[2];
> +
> +	for (x = 0; x < pixels2; x += 2, ++dbuf32) {
> +		pix[0] = le32_to_cpu(sbuf32[x]);
> +		pix[1] = le32_to_cpu(sbuf32[x + 1]);
> +		val32 = ((pix[0] & 0xf0000000) >> 16) |
> +			((pix[0] & 0x00f00000) >> 12) |
> +			((pix[0] & 0x0000f000) >> 8) |
> +			((pix[0] & 0x000000f0) >> 4) |
> +			((pix[1] & 0xf0000000) >> 0) |
> +			((pix[1] & 0x00f00000) << 4) |
> +			((pix[1] & 0x0000f000) << 8) |
> +			((pix[1] & 0x000000f0) << 12);
> +		*dbuf32 = cpu_to_le32(val32);
> +	}

This loop is an optimization. It converts two source pixels at a time 
and writes them with a 32-bit store. When I measured the impact, I was 
quite delighted by the results. I measured the time it takes to convert 
a full cursor image of 64x64 pixels with drm_fb_argb8888_to_argb4444(), 
and then looked at the average.

Without this loop in place, the average runtime stabilizes around 97K 
nsecs. Here are the final results:

[  406.420664] ast 0000:02:00.0: [drm] count=8448 average=97239 nsec
[  414.869034] ast 0000:02:00.0: [drm] count=8704 average=97005 nsec
[  425.665928] ast 0000:02:00.0: [drm] count=8960 average=97096 nsec
[  435.185207] ast 0000:02:00.0: [drm] count=9216 average=96711 nsec
[  442.244948] ast 0000:02:00.0: [drm] count=9472 average=96432 nsec

Count is the number of samples. The time has been taken with ktime_get_ns().

With the additional loop, the values stabilize around 52K nsecs.

[  348.797840] ast 0000:02:00.0: [drm] count=8448 average=51729
[  356.503387] ast 0000:02:00.0: [drm] count=8704 average=51680
[  364.151804] ast 0000:02:00.0: [drm] count=8960 average=51574
[  372.412221] ast 0000:02:00.0: [drm] count=9216 average=51563
[  425.158072] ast 0000:02:00.0: [drm] count=9472 average=51674

That's only ~53% of the runtime of the unoptimized case.

Given these results, I'll try to add similar optimizations to other 
format-conversion helpers. Most of the format conversion happens for 
drivers with only a single output format, such as simpledrm. For 
full-screen pageflips on such drivers, it might even make a visible 
difference.

Best regards
Thomas


> +	for (; x < pixels; x++) {
> +		pix[0] = le32_to_cpu(sbuf32[x]);
> +		val16 = ((pix[0] & 0xf0000000) >> 16) |
> +			((pix[0] & 0x00f00000) >> 12) |
> +			((pix[0] & 0x0000f000) >> 8) |
> +			((pix[0] & 0x000000f0) >> 4);
> +		dbuf16[x] = cpu_to_le16(val16);
> +	}
> +}
> +
> +/**
> + * drm_fb_argb8888_to_argb4444 - Convert ARGB8888 to ARGB4444 clip buffer
> + * @dst: Array of ARGB4444 destination buffers
> + * @dst_pitch: Array of numbers of bytes between the start of two consecutive scanlines
> + *             within @dst; can be NULL if scanlines are stored next to each other.
> + * @src: Array of ARGB8888 source buffers
> + * @fb: DRM framebuffer
> + * @clip: Clip rectangle area to copy
> + * @state: Transform and conversion state
> + *
> + * This function copies parts of a framebuffer to display memory and converts
> + * the color format during the process. The parameters @dst, @dst_pitch and
> + * @src refer to arrays. Each array must have at least as many entries as
> + * there are planes in @fb's format. Each entry stores the value for the
> + * format's respective color plane at the same index.
> + *
> + * The per-line conversion keeps the four most significant bits of each 8-bit
> + * alpha and color component. The destination uses 2 bytes per pixel in its
> + * first color plane.
> + *
> + * This function does not apply clipping on @dst (i.e. the destination is at the
> + * top-left corner).
> + *
> + * Drivers can use this function for ARGB4444 devices that don't support
> + * ARGB8888 natively.
> + */
> +void drm_fb_argb8888_to_argb4444(struct iosys_map *dst, const unsigned int *dst_pitch,
> +				 const struct iosys_map *src, const struct drm_framebuffer *fb,
> +				 const struct drm_rect *clip, struct drm_format_conv_state *state)
> +{
> +	static const u8 dst_pixsize[DRM_FORMAT_MAX_PLANES] = {
> +		2,
> +	};
> +
> +	drm_fb_xfrm(dst, dst_pitch, dst_pixsize, src, fb, clip, false, state,
> +		    drm_fb_argb8888_to_argb4444_line);
> +}
> +EXPORT_SYMBOL(drm_fb_argb8888_to_argb4444);
> +
>   /**
>    * drm_fb_blit - Copy parts of a framebuffer to display memory
>    * @dst:	Array of display-memory addresses to copy to
> diff --git a/include/drm/drm_format_helper.h b/include/drm/drm_format_helper.h
> index 428d81afe215..a1347e47e9d5 100644
> --- a/include/drm/drm_format_helper.h
> +++ b/include/drm/drm_format_helper.h
> @@ -110,6 +110,9 @@ void drm_fb_xrgb8888_to_argb2101010(struct iosys_map *dst, const unsigned int *d
>   void drm_fb_xrgb8888_to_gray8(struct iosys_map *dst, const unsigned int *dst_pitch,
>   			      const struct iosys_map *src, const struct drm_framebuffer *fb,
>   			      const struct drm_rect *clip, struct drm_format_conv_state *state);
> +void drm_fb_argb8888_to_argb4444(struct iosys_map *dst, const unsigned int *dst_pitch,
> +				 const struct iosys_map *src, const struct drm_framebuffer *fb,
> +				 const struct drm_rect *clip, struct drm_format_conv_state *state);
>   
>   int drm_fb_blit(struct iosys_map *dst, const unsigned int *dst_pitch, uint32_t dst_format,
>   		const struct iosys_map *src, const struct drm_framebuffer *fb,

-- 
--
Thomas Zimmermann
Graphics Driver Developer
SUSE Software Solutions Germany GmbH
Frankenstrasse 146, 90461 Nuernberg, Germany
GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman
HRB 36809 (AG Nuernberg)



More information about the dri-devel mailing list