[PATCH 8/8] drm/tinydrm: Relax buffer line prefetch

Wed Oct 4 18:06:35 UTC 2017

vmalloc BOs give us cached reads, so there is no need to prefetch in that case.
Prefetching gives a ~20% speedup on a cma buffer using the mi0283qt
driver on a Raspberry Pi 1.

Signed-off-by: Noralf Trønnes <noralf at tronnes.org>
---
 drivers/gpu/drm/tinydrm/core/tinydrm-helpers.c | 54 ++++++++++++++------------
 1 file changed, 30 insertions(+), 24 deletions(-)

diff --git a/drivers/gpu/drm/tinydrm/core/tinydrm-helpers.c b/drivers/gpu/drm/tinydrm/core/tinydrm-helpers.c
index ee9a8f3..bca9052 100644
--- a/drivers/gpu/drm/tinydrm/core/tinydrm-helpers.c
+++ b/drivers/gpu/drm/tinydrm/core/tinydrm-helpers.c
@@ -15,6 +15,8 @@
 #include <linux/swab.h>
 
 #include <drm/drmP.h>
+#include <drm/drm_gem.h>
+#include <drm/drm_gem_framebuffer_helper.h>
 #include <drm/tinydrm/tinydrm.h>
 #include <drm/tinydrm/tinydrm-helpers.h>
 
@@ -115,22 +117,25 @@ void tinydrm_swab16(u16 *dst, void *vaddr, struct drm_framebuffer *fb,
 		    struct drm_clip_rect *clip)
 {
 	size_t len = (clip->x2 - clip->x1) * sizeof(u16);
+	u16 *src, *buf = NULL;
 	unsigned int x, y;
-	u16 *src, *buf;
 
 	/*
-	 * The cma memory is write-combined so reads are uncached.
-	 * Speed up by fetching one line at a time.
+	 * Imported buffers are likely to be write-combined with uncached
+	 * reads. Speed up by fetching one line at a time.
+	 * prefetch_range() was tried, but didn't give any noticeable speedup
+	 * on the Raspberry Pi 1.
 	 */
-	buf = kmalloc(len, GFP_KERNEL);
-	if (!buf)
-		return;
+	if (drm_gem_fb_get_obj(fb, 0)->import_attach)
+		buf = kmalloc(len, GFP_KERNEL);
 
 	for (y = clip->y1; y < clip->y2; y++) {
 		src = vaddr + (y * fb->pitches[0]);
 		src += clip->x1;
-		memcpy(buf, src, len);
-		src = buf;
+		if (buf) {
+			memcpy(buf, src, len);
+			src = buf;
+		}
 		for (x = clip->x1; x < clip->x2; x++)
 			*dst++ = swab16(*src++);
 	}
@@ -155,19 +160,21 @@ void tinydrm_xrgb8888_to_rgb565(u16 *dst, void *vaddr,
 				struct drm_clip_rect *clip, bool swap)
 {
 	size_t len = (clip->x2 - clip->x1) * sizeof(u32);
+	u32 *src, *buf = NULL;
 	unsigned int x, y;
-	u32 *src, *buf;
 	u16 val16;
 
-	buf = kmalloc(len, GFP_KERNEL);
-	if (!buf)
-		return;
+	/* See tinydrm_swab16() for an explanation */
+	if (drm_gem_fb_get_obj(fb, 0)->import_attach)
+		buf = kmalloc(len, GFP_KERNEL);
 
 	for (y = clip->y1; y < clip->y2; y++) {
 		src = vaddr + (y * fb->pitches[0]);
 		src += clip->x1;
-		memcpy(buf, src, len);
-		src = buf;
+		if (buf) {
+			memcpy(buf, src, len);
+			src = buf;
+		}
 		for (x = clip->x1; x < clip->x2; x++) {
 			val16 = ((*src & 0x00F80000) >> 8) |
 				((*src & 0x0000FC00) >> 5) |
@@ -205,24 +212,23 @@ void tinydrm_xrgb8888_to_gray8(u8 *dst, void *vaddr, struct drm_framebuffer *fb,
 {
 	unsigned int len = (clip->x2 - clip->x1) * sizeof(u32);
 	unsigned int x, y;
-	void *buf;
+	void *buf = NULL;
 	u32 *src;
 
 	if (WARN_ON(fb->format->format != DRM_FORMAT_XRGB8888))
 		return;
-	/*
-	 * The cma memory is write-combined so reads are uncached.
-	 * Speed up by fetching one line at a time.
-	 */
-	buf = kmalloc(len, GFP_KERNEL);
-	if (!buf)
-		return;
+
+	/* See tinydrm_swab16() for an explanation */
+	if (drm_gem_fb_get_obj(fb, 0)->import_attach)
+		buf = kmalloc(len, GFP_KERNEL);
 
 	for (y = clip->y1; y < clip->y2; y++) {
 		src = vaddr + (y * fb->pitches[0]);
 		src += clip->x1;
-		memcpy(buf, src, len);
-		src = buf;
+		if (buf) {
+			memcpy(buf, src, len);
+			src = buf;
+		}
 		for (x = clip->x1; x < clip->x2; x++) {
 			u8 r = (*src & 0x00ff0000) >> 16;
 			u8 g = (*src & 0x0000ff00) >> 8;
-- 
2.7.4