[PATCH 4/4] drm/mgag200: Use DMA to copy the framebuffer to the VRAM

Tue May 9 09:49:52 UTC 2023

On 08/05/2023 10:04, Thomas Zimmermann wrote:
> Hi Jocelyn
> 
> Am 05.05.23 um 14:43 schrieb Jocelyn Falempe:
>> Even if the transfer is not faster, it brings significant
>> improvement in latencies and CPU usage.
>>
> 
> I think I shot down this patch already. I don't want to maintain the 
> extra code for DMA support.

Yes, thank you for still taking a look at it !
I will also be there to fix regressions, if any.
> 
> 
>> CPU usage drops from 100% of one core to 3% when continuously
>> refreshing the screen.
> 
> But this result is nice, so mabe I'd reconsider. Let's take this patch 
> as an RFC to discuss possible ways forward.
> 
> There seems to be some fallback still in place in this patch. I 
> definately don't want multiple codepaths for copying; so the DMA code 
> needs to work on all our hardware.

As it is new code, and I can't test on every hardware, I let a fallback 
in case it breaks on some configuration. But I agree, it can be DMA 
only, as all G200 should work with this code.

> 
>>
>> The primary DMA is used to send commands (register write), and
>> the secondary DMA to send the pixel data.
>> It uses the ILOAD command (chapter 4.5.7 in G200 specification),
>> which allows to load an image, or a part of an image from system
>> memory to VRAM.
>> The last command sent, is a softrap command, which triggers an IRQ
>> when the DMA transfer is complete.
>> For 16bits and 24bits pixel width, each line is padded to make sure,
>> the DMA transfer is a multiple of 32bits. The padded bytes won't be
>> drawn on the screen, so they don't need to be initialized.
>>
>> Signed-off-by: Jocelyn Falempe <jfalempe at redhat.com>
>> ---
>>   drivers/gpu/drm/mgag200/Makefile       |   3 +-
>>   drivers/gpu/drm/mgag200/mgag200_dma.c  | 114 +++++++++++++++++++++
>>   drivers/gpu/drm/mgag200/mgag200_drv.c  |   2 +
>>   drivers/gpu/drm/mgag200/mgag200_drv.h  |  25 +++++
>>   drivers/gpu/drm/mgag200/mgag200_mode.c | 131 ++++++++++++++++++++++++-
>>   drivers/gpu/drm/mgag200/mgag200_reg.h  |  25 +++++
>>   6 files changed, 295 insertions(+), 5 deletions(-)
>>   create mode 100644 drivers/gpu/drm/mgag200/mgag200_dma.c
>>
>> diff --git a/drivers/gpu/drm/mgag200/Makefile 
>> b/drivers/gpu/drm/mgag200/Makefile
>> index 182e224c460d..96e23dc5572c 100644
>> --- a/drivers/gpu/drm/mgag200/Makefile
>> +++ b/drivers/gpu/drm/mgag200/Makefile
>> @@ -11,6 +11,7 @@ mgag200-y := \
>>       mgag200_g200se.o \
>>       mgag200_g200wb.o \
>>       mgag200_i2c.o \
>> -    mgag200_mode.o
>> +    mgag200_mode.o \
>> +    mgag200_dma.o
>>   obj-$(CONFIG_DRM_MGAG200) += mgag200.o
>> diff --git a/drivers/gpu/drm/mgag200/mgag200_dma.c 
>> b/drivers/gpu/drm/mgag200/mgag200_dma.c
>> new file mode 100644
>> index 000000000000..fe063fa2b5f1
>> --- /dev/null
>> +++ b/drivers/gpu/drm/mgag200/mgag200_dma.c
>> @@ -0,0 +1,114 @@
>> +// SPDX-License-Identifier: GPL-2.0-only
>> +/*
>> + * Copyright 2023 Red Hat
>> + *
>> + * Authors: Jocelyn Falempe
>> + *
>> + */
>> +
>> +#include <linux/wait.h>
>> +#include <linux/dma-mapping.h>
>> +
>> +#include "mgag200_drv.h"
>> +#include "mgag200_reg.h"
>> +
>> +/* Maximum number of command block for one DMA transfert
>> + * iload should only use 4 blocks
>> + */
>> +#define MGA_MAX_CMD        50
>> +
>> +#define MGA_DMA_SIZE        (4 * 1024 * 1024)
>> +#define MGA_MIN_DMA_SIZE    (64 * 1024)
>> +
>> +/*
>> + * Allocate coherent buffers for DMA transfert.
>> + * We need two buffers, one for the commands, and one for the data.
>> + * If allocation fails, mdev->dma_buf will be NULL, and the driver will
>> + * fallback to memcpy().
>> + */
>> +void mgag200_dma_allocate(struct mga_device *mdev)
>> +{
>> +    struct drm_device *dev = &mdev->base;
>> +    int size;
>> +    /* Allocate the command buffer */
>> +    mdev->cmd = dmam_alloc_coherent(dev->dev, MGA_MAX_CMD * 
>> sizeof(*mdev->cmd),
>> +                    &mdev->cmd_handle, GFP_KERNEL);
>> +
>> +    if (!mdev->cmd) {
>> +        drm_warn(dev, "DMA command buffer allocation failed, fallback 
>> to cpu copy\n");
>> +        return;
>> +    }
>> +
>> +    for (size = MGA_DMA_SIZE; size >= MGA_MIN_DMA_SIZE; size = size 
>> >> 1) {
>> +        mdev->dma_buf = dmam_alloc_coherent(dev->dev, size, 
>> &mdev->dma_handle, GFP_KERNEL);
>> +        if (mdev->dma_buf)
>> +            break;
>> +    }
>> +    if (!mdev->dma_buf) {
>> +        drm_warn(dev, "DMA data buffer allocation failed, fallback to 
>> cpu copy\n");
>> +        return;
>> +    }
>> +    mdev->dma_size = size;
>> +    drm_info(dev, "Using DMA with a %dk data buffer\n", size / 1024);
>> +}
>> +
>> +/*
>> + * Matrox uses commands block to program the hardware through DMA.
>> + * Each command is a register write, and each block contains 4 commands
>> + * packed in 5 words(u32).
>> + * First word is the 4 register index (8bit) to write for the 4 
>> commands,
>> + * followed by the four values to be written.
>> + */
>> +void mgag200_dma_add_block(struct mga_device *mdev,
>> +               u32 reg0, u32 val0,
>> +               u32 reg1, u32 val1,
>> +               u32 reg2, u32 val2,
>> +               u32 reg3, u32 val3)
>> +{
>> +    if (mdev->cmd_idx >= MGA_MAX_CMD) {
>> +        pr_err("mgag200: exceeding the DMA command buffer size\n");
>> +        return;
>> +    }
>> +
>> +    mdev->cmd[mdev->cmd_idx] = (struct mga_cmd_block) {
>> +        .cmd = DMAREG(reg0) | DMAREG(reg1) << 8 | DMAREG(reg2) << 16 
>> | DMAREG(reg3) << 24,
>> +        .v0 = val0,
>> +        .v1 = val1,
>> +        .v2 = val2,
>> +        .v3 = val3};
>> +    mdev->cmd_idx++;
>> +}
>> +
>> +void mgag200_dma_run_cmd(struct mga_device *mdev)
>> +{
>> +    struct drm_device *dev = &mdev->base;
>> +    u32 primend;
>> +    int ret;
>> +
>> +    /* Add a last block to trigger the softrap interrupt */
>> +    mgag200_dma_add_block(mdev,
>> +            MGAREG_DMAPAD, 0,
>> +            MGAREG_DMAPAD, 0,
>> +            MGAREG_DMAPAD, 0,
>> +            MGAREG_SOFTRAP, 0);
>> +
>> +    primend = mdev->cmd_handle + mdev->cmd_idx * sizeof(struct 
>> mga_cmd_block);
>> +
>> +    // Use primary DMA to send the commands
>> +    WREG32(MGAREG_PRIMADDR, (u32) mdev->cmd_handle);
>> +    WREG32(MGAREG_PRIMEND, primend);
>> +    mdev->dma_in_use = 1;
>> +
>> +    ret = wait_event_timeout(mdev->waitq, mdev->dma_in_use == 0, HZ);
>> +
>> +    if (mdev->dma_in_use) {
>> +        drm_err(dev, "DMA transfert timed out\n");
>> +        /* something goes wrong, reset the DMA engine */
>> +        WREG8(MGAREG_OPMODE, MGAOPM_DMA_BLIT);
>> +        mdev->dma_in_use = 0;
>> +    }
>> +
>> +    /* reset command index to start a new sequence */
>> +    mdev->cmd_idx = 0;
>> +}
> 
> Can the DMA code be build around Linux' struct drm_device?

I just took a look at which field of the struct drm_device I can use 
there. "struct drm_device_dma * dma" is defined in drm_legacy.h, does 
that mean I shouldn't use it for a new work ?

> 
>> +
>> diff --git a/drivers/gpu/drm/mgag200/mgag200_drv.c 
>> b/drivers/gpu/drm/mgag200/mgag200_drv.c
>> index 3566fcdfe1e4..c863487526e7 100644
>> --- a/drivers/gpu/drm/mgag200/mgag200_drv.c
>> +++ b/drivers/gpu/drm/mgag200/mgag200_drv.c
>> @@ -184,6 +184,8 @@ int mgag200_device_preinit(struct mga_device *mdev)
>>       if (!mdev->vram)
>>           return -ENOMEM;
>> +    mgag200_dma_allocate(mdev);
>> +
>>       mgag200_init_irq(mdev);
>>       ret = devm_request_irq(dev->dev, pdev->irq, 
>> mgag200_driver_irq_handler,
>>                      IRQF_SHARED, "mgag200_irq", mdev);
>> diff --git a/drivers/gpu/drm/mgag200/mgag200_drv.h 
>> b/drivers/gpu/drm/mgag200/mgag200_drv.h
>> index 02175bfaf5a8..f5060fdc16f9 100644
>> --- a/drivers/gpu/drm/mgag200/mgag200_drv.h
>> +++ b/drivers/gpu/drm/mgag200/mgag200_drv.h
>> @@ -277,6 +277,14 @@ struct mgag200_device_funcs {
>>       void (*pixpllc_atomic_update)(struct drm_crtc *crtc, struct 
>> drm_atomic_state *old_state);
>>   };
>> +struct mga_cmd_block {
>> +    u32 cmd;
>> +    u32 v0;
>> +    u32 v1;
>> +    u32 v2;
>> +    u32 v3;
>> +} __packed;
>> +
>>   struct mga_device {
>>       struct drm_device base;
>> @@ -291,6 +299,14 @@ struct mga_device {
>>       void __iomem            *vram;
>>       resource_size_t            vram_available;
>> +    void *dma_buf;
>> +    size_t dma_size;
>> +    dma_addr_t dma_handle;
>> +
>> +    struct mga_cmd_block *cmd;
>> +    int cmd_idx;
>> +    dma_addr_t cmd_handle;
>> +
>>       wait_queue_head_t waitq;
>>       int dma_in_use;
>> @@ -446,4 +462,13 @@ void mgag200_bmc_enable_vidrst(struct mga_device 
>> *mdev);
>>                   /* mgag200_i2c.c */
>>   int mgag200_i2c_init(struct mga_device *mdev, struct mga_i2c_chan 
>> *i2c);
>> +/* mgag200_dma.c */
>> +void mgag200_dma_allocate(struct mga_device *mdev);
>> +void mgag200_dma_add_block(struct mga_device *mdev,
>> +               u32 reg0, u32 val0,
>> +               u32 reg1, u32 val1,
>> +               u32 reg2, u32 val2,
>> +               u32 reg3, u32 val3);
>> +void mgag200_dma_run_cmd(struct mga_device *mdev);
>> +
>>   #endif                /* __MGAG200_DRV_H__ */
>> diff --git a/drivers/gpu/drm/mgag200/mgag200_mode.c 
>> b/drivers/gpu/drm/mgag200/mgag200_mode.c
>> index 7d8c65372ac4..7825ec4323d2 100644
>> --- a/drivers/gpu/drm/mgag200/mgag200_mode.c
>> +++ b/drivers/gpu/drm/mgag200/mgag200_mode.c
>> @@ -398,13 +398,132 @@ static void mgag200_disable_display(struct 
>> mga_device *mdev)
>>       WREG_ECRT(0x01, crtcext1);
>>   }
>> +static void mgag200_dwg_setup(struct mga_device *mdev, struct 
>> drm_framebuffer *fb)
>> +{
>> +    u32 maccess;
>> +
>> +    drm_dbg(&mdev->base, "Setup DWG with %dx%d %p4cc\n",
>> +        fb->width, fb->height, &fb->format->format);
>> +
>> +    switch (fb->format->format) {
>> +    case DRM_FORMAT_RGB565:
>> +        maccess = MGAMAC_PW16;
>> +        break;
>> +    case DRM_FORMAT_RGB888:
>> +        maccess = MGAMAC_PW24;
>> +        break;
>> +    case DRM_FORMAT_XRGB8888:
>> +        maccess = MGAMAC_PW32;
>> +        break;
>> +    }
>> +    WREG32(MGAREG_MACCESS, maccess);
>> +
>> +    /* Framebuffer width in pixel */
>> +    WREG32(MGAREG_PITCH, fb->width);
>> +
>> +    /* Sane default value for the drawing engine registers */
>> +    WREG32(MGAREG_DSTORG, 0);
>> +    WREG32(MGAREG_YDSTORG, 0);
>> +    WREG32(MGAREG_SRCORG, 0);
>> +    WREG32(MGAREG_CXBNDRY, 0x0FFF0000);
>> +    WREG32(MGAREG_YTOP, 0);
>> +    WREG32(MGAREG_YBOT, 0x00FFFFFF);
>> +
>> +    /* Activate blit mode DMA, only write the low part of the 
>> register */
>> +    WREG8(MGAREG_OPMODE, MGAOPM_DMA_BLIT);
>> +}
>> +
>> +/*
>> + * ILOAD allows to load an image from system memory to the VRAM, and 
>> with FXBNDRY, YDST and YDSTLEN,
>> + * you can transfert a rectangle, so it's perfect when used with a 
>> damage clip.
>> + */
>> +static void mgag200_iload_cmd(struct mga_device *mdev, int x, int y, 
>> int width, int height,
>> +                  int width_padded, int cpp)
>> +{
>> +    int size = width_padded * height;
>> +    u32 iload;
>> +
>> +    iload = MGADWG_ILOAD | MGADWG_SGNZERO | MGADWG_SHIFTZERO | 
>> MGADWG_REPLACE | MGADWG_CLIPDIS
>> +        | MGADWG_BFCOL;
>> +
>> +    mgag200_dma_add_block(mdev,
>> +        MGAREG_DWGCTL, iload,
>> +        MGAREG_FXBNDRY, (((x + width - 1) << 16) | x),
>> +        MGAREG_AR0, (width_padded / cpp) - 1,
>> +        MGAREG_AR3, 0);
>> +
>> +    mgag200_dma_add_block(mdev,
>> +        MGAREG_AR5, 0,
>> +        MGAREG_YDST, y,
>> +        MGAREG_DMAPAD, 0,
>> +        MGAREG_DMAPAD, 0);
>> +
>> +    mgag200_dma_add_block(mdev,
>> +        MGAREG_DMAPAD, 0,
>> +        MGAREG_LEN | MGAREG_EXEC, height,
>> +        MGAREG_SECADDR, mdev->dma_handle | 1,
>> +        /* Writing SECEND should always be the last command of a 
>> block */
>> +        MGAREG_SECEND, mdev->dma_handle + size);
>> +}
>> +
>> +static void mgag200_dma_copy(struct mga_device *mdev, const void 
>> *src, u32 pitch,
>> +                struct drm_rect *clip, int cpp)
>> +{
>> +    int i;
>> +    int width = drm_rect_width(clip);
>> +    int height = drm_rect_height(clip);
>> +
>> +    /* pad each line to 32bits boundaries see section 4.5.7 of G200 
>> Specification */
>> +    int width_padded = round_up(width * cpp, 4);
>> +
>> +    for (i = 0; i < height; i++)
>> +        memcpy(mdev->dma_buf + width_padded * i,
>> +               src + (((clip->y1 + i) * pitch) + clip->x1 * cpp),
>> +               width * cpp);
> 
> This memcpy() should probably go away.  Instead of SHMEM, mgag200 should 
> allocate with GEM DMA helpers. We have a number of drivers that use DMA 
> helpers with DRM format helpers, so the conversion should be 
> strait-forward and can be done independently from the other patches.

This is something I tried to do.
It will also make the driver a bit more complex, because when the damage 
rectangle is not the full screen, I will need to do one ILOAD per line, 
instead of one for the whole rectangle, but that's still manageable.
Do you know which driver I can take as an example ?

> 
> Afterward, it should be possible to DMA-copy directly from the GEM 
> buffer object.
> 
>> +
>> +    mgag200_iload_cmd(mdev, clip->x1, clip->y1, width, height, 
>> width_padded, cpp);
>> +    mgag200_dma_run_cmd(mdev);
>> +}
>> +
>> +/*
>> + * If the DMA coherent buffer is smaller than damage rectangle, we 
>> need to
>> + * split it into multiple DMA transfert.
>> + */
>> +static void mgag200_dma_damage(struct mga_device *mdev, const struct 
>> iosys_map *vmap,
>> +                   struct drm_framebuffer *fb, struct drm_rect *clip)
>> +{
>> +    u32 pitch = fb->pitches[0];
>> +    const void *src = vmap[0].vaddr;
>> +    struct drm_rect subclip;
>> +    int y1;
>> +    int lines;
>> +    int cpp = fb->format->cpp[0];
>> +
>> +    /* Number of lines that fits in one DMA buffer */
>> +    lines = min(drm_rect_height(clip), (int) mdev->dma_size / 
>> (drm_rect_width(clip) * cpp));
>> +
>> +    subclip.x1 = clip->x1;
>> +    subclip.x2 = clip->x2;
>> +
>> +    for (y1 = clip->y1; y1 < clip->y2; y1 += lines) {
>> +        subclip.y1 = y1;
>> +        subclip.y2 = min(clip->y2, y1 + lines);
>> +        mgag200_dma_copy(mdev, src, pitch, &subclip, cpp);
>> +    }
>> +}
>> +
>>   static void mgag200_handle_damage(struct mga_device *mdev, const 
>> struct iosys_map *vmap,
>>                     struct drm_framebuffer *fb, struct drm_rect *clip)
>>   {
>> -    struct iosys_map dst = IOSYS_MAP_INIT_VADDR_IOMEM(mdev->vram);
>> -
>> -    iosys_map_incr(&dst, drm_fb_clip_offset(fb->pitches[0], 
>> fb->format, clip));
>> -    drm_fb_memcpy(&dst, fb->pitches, vmap, fb, clip);
>> +    if (mdev->dma_buf) {
>> +        /* Fast path, use DMA */
>> +        mgag200_dma_damage(mdev, vmap, fb, clip);
>> +    } else {
>> +        struct iosys_map dst = IOSYS_MAP_INIT_VADDR_IOMEM(mdev->vram);
>> +
>> +        iosys_map_incr(&dst, drm_fb_clip_offset(fb->pitches[0], 
>> fb->format, clip));
>> +        drm_fb_memcpy(&dst, fb->pitches, vmap, fb, clip);
>> +    }
> 
> This branching needs to go. As I said, DMA is either the standard, or 
> not there at all. I doubt that the !mdev->dmabuf case can easily happen 
> in practice. AFAICT it's all in the 32-bit address range and we're on 
> systems with enough physical memory.

I put that in place because on my system, I can't allocate more than 4MB 
with dmam_alloc_coherent(). (And my framebuffer is 5MB, so my first 
attempt was unsuccessful).
I'm not sure what is the reasonable amount of dma memory that we are 
almost guarantee to allocate.

> 
> Best regards
> Thomas
> 
>>   }
>>   /*
>> @@ -475,6 +594,10 @@ void 
>> mgag200_primary_plane_helper_atomic_update(struct drm_plane *plane,
>>       if (!fb)
>>           return;
>> +    if (!old_plane_state->fb || fb->format != 
>> old_plane_state->fb->format
>> +        || fb->width != old_plane_state->fb->width)
>> +        mgag200_dwg_setup(mdev, fb);
>> +
>>       drm_atomic_helper_damage_iter_init(&iter, old_plane_state, 
>> plane_state);
>>       drm_atomic_for_each_plane_damage(&iter, &damage) {
>>           mgag200_handle_damage(mdev, shadow_plane_state->data, fb, 
>> &damage);
>> diff --git a/drivers/gpu/drm/mgag200/mgag200_reg.h 
>> b/drivers/gpu/drm/mgag200/mgag200_reg.h
>> index 748c8e18e938..256ac92dae56 100644
>> --- a/drivers/gpu/drm/mgag200/mgag200_reg.h
>> +++ b/drivers/gpu/drm/mgag200/mgag200_reg.h
>> @@ -116,6 +116,9 @@
>>   #define    MGAREG_OPMODE        0x1e54
>> +#define MGAREG_PRIMADDR        0x1e58
>> +#define MGAREG_PRIMEND        0x1e5c
>> +
>>   /* Warp Registers */
>>   #define MGAREG_WIADDR           0x1dc0
>>   #define MGAREG_WIADDR2          0x1dd8
>> @@ -200,6 +203,8 @@
>>   /* See table on 4-43 for bop ALU operations */
>> +#define MGADWG_REPLACE    (0xC << 16)
>> +
>>   /* See table on 4-44 for translucidity masks */
>>   #define MGADWG_BMONOLEF        ( 0x00 << 25 )
>> @@ -218,6 +223,8 @@
>>   #define MGADWG_PATTERN        ( 0x01 << 29 )
>>   #define MGADWG_TRANSC        ( 0x01 << 30 )
>> +#define MGADWG_CLIPDIS        ( 0x01 << 31 )
>> +
>>   #define MGAREG_MISC_WRITE    0x3c2
>>   #define MGAREG_MISC_READ    0x3cc
>>   #define MGAREG_MEM_MISC_WRITE       0x1fc2
>> @@ -605,6 +612,9 @@
>>   #    define MGA_TC2_SELECT_TMU1                 (0x80000000)
>>   #define MGAREG_TEXTRANS        0x2c34
>>   #define MGAREG_TEXTRANSHIGH    0x2c38
>> +#define MGAREG_SECADDR        0x2c40
>> +#define MGAREG_SECEND        0x2c44
>> +#define MGAREG_SOFTRAP        0x2c48
>>   #define MGAREG_TEXFILTER    0x2c58
>>   #    define MGA_MIN_NRST                        (0x00000000)
>>   #    define MGA_MIN_BILIN                       (0x00000002)
>> @@ -691,4 +701,19 @@
>>   #define MGA_AGP2XPLL_ENABLE        0x1
>>   #define MGA_AGP2XPLL_DISABLE        0x0
>> +
>> +#define DWGREG0        0x1c00
>> +#define DWGREG0_END    0x1dff
>> +#define DWGREG1        0x2c00
>> +#define DWGREG1_END    0x2dff
>> +
>> +/* These macros convert register address to the 8 bit command index 
>> used with DMA
>> + * It remaps 0x1c00-0x1dff to 0x00-0x7f (REG0)
>> + * and 0x2c00-0x2dff to 0x80-0xff (REG1)
>> + */
>> +#define ISREG0(r)    (r >= DWGREG0 && r <= DWGREG0_END)
>> +#define DMAREG0(r)    ((u8)((r - DWGREG0) >> 2))
>> +#define DMAREG1(r)    ((u8)(((r - DWGREG1) >> 2) | 0x80))
>> +#define DMAREG(r)    (ISREG0((r)) ? DMAREG0((r)) : DMAREG1((r)))
>> +
>>   #endif
> 

Best regards,

-- 

Jocelyn