[PATCH v2 8/8] drm/tinydrm: Relax buffer line prefetch
Noralf Trønnes
noralf at tronnes.org
Sun Oct 15 16:30:42 UTC 2017
vmalloc BOs give us cached reads, so there is no need to prefetch in that case.
Prefetching gives a ~20% speedup on a CMA buffer using the mi0283qt
driver on a Raspberry Pi 1.
Signed-off-by: Noralf Trønnes <noralf at tronnes.org>
---
drivers/gpu/drm/tinydrm/core/tinydrm-helpers.c | 54 ++++++++++++++------------
1 file changed, 30 insertions(+), 24 deletions(-)
diff --git a/drivers/gpu/drm/tinydrm/core/tinydrm-helpers.c b/drivers/gpu/drm/tinydrm/core/tinydrm-helpers.c
index ee9a8f305b26..bca905213cdd 100644
--- a/drivers/gpu/drm/tinydrm/core/tinydrm-helpers.c
+++ b/drivers/gpu/drm/tinydrm/core/tinydrm-helpers.c
@@ -15,6 +15,8 @@
#include <linux/swab.h>
#include <drm/drmP.h>
+#include <drm/drm_gem.h>
+#include <drm/drm_gem_framebuffer_helper.h>
#include <drm/tinydrm/tinydrm.h>
#include <drm/tinydrm/tinydrm-helpers.h>
@@ -115,22 +117,25 @@ void tinydrm_swab16(u16 *dst, void *vaddr, struct drm_framebuffer *fb,
struct drm_clip_rect *clip)
{
size_t len = (clip->x2 - clip->x1) * sizeof(u16);
+ u16 *src, *buf = NULL;
unsigned int x, y;
- u16 *src, *buf;
/*
- * The cma memory is write-combined so reads are uncached.
- * Speed up by fetching one line at a time.
+ * Imported buffers are likely to be write-combined with uncached
+ * reads. Speed up by fetching one line at a time.
+ * prefetch_range() was tried, but didn't give any noticeable speedup
+ * on the Raspberry Pi 1.
*/
- buf = kmalloc(len, GFP_KERNEL);
- if (!buf)
- return;
+ if (drm_gem_fb_get_obj(fb, 0)->import_attach)
+ buf = kmalloc(len, GFP_KERNEL);
for (y = clip->y1; y < clip->y2; y++) {
src = vaddr + (y * fb->pitches[0]);
src += clip->x1;
- memcpy(buf, src, len);
- src = buf;
+ if (buf) {
+ memcpy(buf, src, len);
+ src = buf;
+ }
for (x = clip->x1; x < clip->x2; x++)
*dst++ = swab16(*src++);
}
@@ -155,19 +160,21 @@ void tinydrm_xrgb8888_to_rgb565(u16 *dst, void *vaddr,
struct drm_clip_rect *clip, bool swap)
{
size_t len = (clip->x2 - clip->x1) * sizeof(u32);
+ u32 *src, *buf = NULL;
unsigned int x, y;
- u32 *src, *buf;
u16 val16;
- buf = kmalloc(len, GFP_KERNEL);
- if (!buf)
- return;
+ /* See tinydrm_swab16() for an explanation */
+ if (drm_gem_fb_get_obj(fb, 0)->import_attach)
+ buf = kmalloc(len, GFP_KERNEL);
for (y = clip->y1; y < clip->y2; y++) {
src = vaddr + (y * fb->pitches[0]);
src += clip->x1;
- memcpy(buf, src, len);
- src = buf;
+ if (buf) {
+ memcpy(buf, src, len);
+ src = buf;
+ }
for (x = clip->x1; x < clip->x2; x++) {
val16 = ((*src & 0x00F80000) >> 8) |
((*src & 0x0000FC00) >> 5) |
@@ -205,24 +212,23 @@ void tinydrm_xrgb8888_to_gray8(u8 *dst, void *vaddr, struct drm_framebuffer *fb,
{
unsigned int len = (clip->x2 - clip->x1) * sizeof(u32);
unsigned int x, y;
- void *buf;
+ void *buf = NULL;
u32 *src;
if (WARN_ON(fb->format->format != DRM_FORMAT_XRGB8888))
return;
- /*
- * The cma memory is write-combined so reads are uncached.
- * Speed up by fetching one line at a time.
- */
- buf = kmalloc(len, GFP_KERNEL);
- if (!buf)
- return;
+
+ /* See tinydrm_swab16() for an explanation */
+ if (drm_gem_fb_get_obj(fb, 0)->import_attach)
+ buf = kmalloc(len, GFP_KERNEL);
for (y = clip->y1; y < clip->y2; y++) {
src = vaddr + (y * fb->pitches[0]);
src += clip->x1;
- memcpy(buf, src, len);
- src = buf;
+ if (buf) {
+ memcpy(buf, src, len);
+ src = buf;
+ }
for (x = clip->x1; x < clip->x2; x++) {
u8 r = (*src & 0x00ff0000) >> 16;
u8 g = (*src & 0x0000ff00) >> 8;
--
2.14.2
More information about the dri-devel
mailing list