[PATCH v6 2/2] drm/lima: driver for ARM Mali4xx GPUs

Qiang Yu yuq825 at gmail.com
Wed Mar 6 02:51:55 UTC 2019


On Wed, Mar 6, 2019 at 4:16 AM Eric Anholt <eric at anholt.net> wrote:
>
> Qiang Yu <yuq825 at gmail.com> writes:
>
> > - Mali 4xx GPUs have two kinds of processors GP and PP. GP is for
> >   OpenGL vertex shader processing and PP is for fragment shader
> >   processing. Each processor has its own MMU so processors work in
> >   virtual address space.
> > - There's only one GP but multiple PP (max 4 for mali 400 and 8
> >   for mali 450) in the same mali 4xx GPU. All PPs are grouped
> >   together to handle a single fragment shader task divided by
> >   FB output tiled pixels. Mali 400 user space driver is
> >   responsible for assigning target tiled pixels to each PP, but mali
> >   450 has a HW module called DLBU to dynamically balance each
> >   PP's load.
> > - User space driver allocate buffer object and map into GPU
> >   virtual address space, upload command stream and draw data with
> >   CPU mmap of the buffer object, then submit task to GP/PP with
> >   a register frame indicating where is the command stream and misc
> >   settings.
> > - There's no command stream validation/relocation due to each user
> >   process has its own GPU virtual address space. GP/PP's MMU switch
> >   virtual address space before running two tasks from different
> >   user process. Erroneous or evil user space code just gets an MMU
> >   fault or GP/PP error IRQ, then the HW/SW will be recovered.
> > - Use GEM+shmem for MM. Currently just alloc and pin memory when
> >   gem object creation. GPU vm map of the buffer is also done in
> >   the alloc stage in kernel space. We may delay the memory
> >   allocation and real GPU vm map to command submission stage in the
> >   future as an improvement.
> > - Use drm_sched for GPU task schedule. Each OpenGL context should
> >   have a lima context object in the kernel to distinguish tasks
> >   from different user. drm_sched gets task from each lima context
> >   in a fair way.
>
> Given the requirement for open source userspace for new DRM kernel
> drivers, it would be nice to see the link to your open source userspace
> with the submission so we can see how the interfaces get used.  (I know
> I found it once before, but I don't remember now)

The link is in the cover letter of this patch series; I'll add it here too next time:
https://gitlab.freedesktop.org/lima/mesa

>
> However, other than a concern about the _pad fields in the UABI, I'm
> ready to add a reviewed-by.
>
> > --- drivers/gpu/drm/Kconfig | 2 + drivers/gpu/drm/Makefile | 1 +
> > drivers/gpu/drm/lima/Kconfig | 10 + drivers/gpu/drm/lima/Makefile | 21
> > ++ drivers/gpu/drm/lima/lima_bcast.c | 47 +++
> > drivers/gpu/drm/lima/lima_bcast.h | 14 +
> > drivers/gpu/drm/lima/lima_ctx.c | 97 ++++++
> > drivers/gpu/drm/lima/lima_ctx.h | 30 ++
> > drivers/gpu/drm/lima/lima_device.c | 385 +++++++++++++++++++++++
> > drivers/gpu/drm/lima/lima_device.h | 131 ++++++++
> > drivers/gpu/drm/lima/lima_dlbu.c | 58 ++++
> > drivers/gpu/drm/lima/lima_dlbu.h | 18 ++
> > drivers/gpu/drm/lima/lima_drv.c | 366 ++++++++++++++++++++++
> > drivers/gpu/drm/lima/lima_drv.h | 45 +++
> > drivers/gpu/drm/lima/lima_gem.c | 381 +++++++++++++++++++++++
> > drivers/gpu/drm/lima/lima_gem.h | 25 ++
> > drivers/gpu/drm/lima/lima_gem_prime.c | 47 +++
> > drivers/gpu/drm/lima/lima_gem_prime.h | 13 +
> > drivers/gpu/drm/lima/lima_gp.c | 283 +++++++++++++++++
> > drivers/gpu/drm/lima/lima_gp.h | 16 +
> > drivers/gpu/drm/lima/lima_l2_cache.c | 80 +++++
> > drivers/gpu/drm/lima/lima_l2_cache.h | 14 +
> > drivers/gpu/drm/lima/lima_mmu.c | 142 +++++++++
> > drivers/gpu/drm/lima/lima_mmu.h | 16 +
> > drivers/gpu/drm/lima/lima_object.c | 122 ++++++++
> > drivers/gpu/drm/lima/lima_object.h | 36 +++
> > drivers/gpu/drm/lima/lima_pmu.c | 60 ++++
> > drivers/gpu/drm/lima/lima_pmu.h | 12 + drivers/gpu/drm/lima/lima_pp.c
> > | 424 ++++++++++++++++++++++++++ drivers/gpu/drm/lima/lima_pp.h | 19
> > ++ drivers/gpu/drm/lima/lima_regs.h | 298 ++++++++++++++++++
> > drivers/gpu/drm/lima/lima_sched.c | 404 ++++++++++++++++++++++++
> > drivers/gpu/drm/lima/lima_sched.h | 104 +++++++
> > drivers/gpu/drm/lima/lima_vm.c | 282 +++++++++++++++++
> > drivers/gpu/drm/lima/lima_vm.h | 62 ++++ include/uapi/drm/lima_drm.h |
> > 139 +++++++++ 36 files changed, 4204 insertions(+) create mode 100644
> > drivers/gpu/drm/lima/Kconfig create mode 100644
> > drivers/gpu/drm/lima/Makefile create mode 100644
> > drivers/gpu/drm/lima/lima_bcast.c create mode 100644
> > drivers/gpu/drm/lima/lima_bcast.h create mode 100644
> > drivers/gpu/drm/lima/lima_ctx.c create mode 100644
> > drivers/gpu/drm/lima/lima_ctx.h create mode 100644
> > drivers/gpu/drm/lima/lima_device.c create mode 100644
> > drivers/gpu/drm/lima/lima_device.h create mode 100644
> > drivers/gpu/drm/lima/lima_dlbu.c create mode 100644
> > drivers/gpu/drm/lima/lima_dlbu.h create mode 100644
> > drivers/gpu/drm/lima/lima_drv.c create mode 100644
> > drivers/gpu/drm/lima/lima_drv.h create mode 100644
> > drivers/gpu/drm/lima/lima_gem.c create mode 100644
> > drivers/gpu/drm/lima/lima_gem.h create mode 100644
> > drivers/gpu/drm/lima/lima_gem_prime.c create mode 100644
> > drivers/gpu/drm/lima/lima_gem_prime.h create mode 100644
> > drivers/gpu/drm/lima/lima_gp.c create mode 100644
> > drivers/gpu/drm/lima/lima_gp.h create mode 100644
> > drivers/gpu/drm/lima/lima_l2_cache.c create mode 100644
> > drivers/gpu/drm/lima/lima_l2_cache.h create mode 100644
> > drivers/gpu/drm/lima/lima_mmu.c create mode 100644
> > drivers/gpu/drm/lima/lima_mmu.h create mode 100644
> > drivers/gpu/drm/lima/lima_object.c create mode 100644
> > drivers/gpu/drm/lima/lima_object.h create mode 100644
> > drivers/gpu/drm/lima/lima_pmu.c create mode 100644
> > drivers/gpu/drm/lima/lima_pmu.h create mode 100644
> > drivers/gpu/drm/lima/lima_pp.c create mode 100644
> > drivers/gpu/drm/lima/lima_pp.h create mode 100644
> > drivers/gpu/drm/lima/lima_regs.h create mode 100644
> > drivers/gpu/drm/lima/lima_sched.c create mode 100644
> > drivers/gpu/drm/lima/lima_sched.h create mode 100644
> > drivers/gpu/drm/lima/lima_vm.c create mode 100644
> > drivers/gpu/drm/lima/lima_vm.h create mode 100644
> > include/uapi/drm/lima_drm.h
>
> > diff --git a/drivers/gpu/drm/lima/lima_sched.c b/drivers/gpu/drm/lima/lima_sched.c
> > new file mode 100644
> > index 000000000000..606e8aad2a82
> > --- /dev/null
> > +++ b/drivers/gpu/drm/lima/lima_sched.c
> > @@ -0,0 +1,404 @@
> > +// SPDX-License-Identifier: GPL-2.0 OR MIT
> > +/* Copyright 2017-2019 Qiang Yu <yuq825 at gmail.com> */
> > +
> > +#include <linux/kthread.h>
> > +#include <linux/slab.h>
> > +
> > +#include "lima_drv.h"
> > +#include "lima_sched.h"
> > +#include "lima_vm.h"
> > +#include "lima_mmu.h"
> > +#include "lima_l2_cache.h"
> > +#include "lima_object.h"
> > +
> > +struct lima_fence {
> > +     struct dma_fence base;
> > +     struct lima_sched_pipe *pipe;
> > +};
> > +
> > +static struct kmem_cache *lima_fence_slab;
> > +
> > +int lima_sched_slab_init(void)
> > +{
> > +     lima_fence_slab = kmem_cache_create(
> > +             "lima_fence", sizeof(struct lima_fence), 0,
> > +             SLAB_HWCACHE_ALIGN, NULL);
> > +     if (!lima_fence_slab)
> > +             return -ENOMEM;
> > +
> > +     return 0;
> > +}
> > +
> > +void lima_sched_slab_fini(void)
> > +{
> > +     kmem_cache_destroy(lima_fence_slab);
> > +}
> > +
> > +static inline struct lima_fence *to_lima_fence(struct dma_fence *fence)
> > +{
> > +     return container_of(fence, struct lima_fence, base);
> > +}
> > +
> > +static const char *lima_fence_get_driver_name(struct dma_fence *fence)
> > +{
> > +     return "lima";
> > +}
> > +
> > +static const char *lima_fence_get_timeline_name(struct dma_fence *fence)
> > +{
> > +     struct lima_fence *f = to_lima_fence(fence);
> > +
> > +     return f->pipe->base.name;
> > +}
> > +
> > +static bool lima_fence_enable_signaling(struct dma_fence *fence)
> > +{
> > +     return true;
> > +}
> > +
> > +static void lima_fence_release_rcu(struct rcu_head *rcu)
> > +{
> > +     struct dma_fence *f = container_of(rcu, struct dma_fence, rcu);
> > +     struct lima_fence *fence = to_lima_fence(f);
> > +
> > +     kmem_cache_free(lima_fence_slab, fence);
> > +}
> > +
> > +static void lima_fence_release(struct dma_fence *fence)
> > +{
> > +     struct lima_fence *f = to_lima_fence(fence);
> > +
> > +     call_rcu(&f->base.rcu, lima_fence_release_rcu);
> > +}
> > +
> > +static const struct dma_fence_ops lima_fence_ops = {
> > +     .get_driver_name = lima_fence_get_driver_name,
> > +     .get_timeline_name = lima_fence_get_timeline_name,
> > +     .enable_signaling = lima_fence_enable_signaling,
> > +     .wait = dma_fence_default_wait,
> > +     .release = lima_fence_release,
> > +};
>
> You can delete .enable_signaling and .wait now (See 418cc6ca0607
> ("dma-fence: Make ->wait callback optional"))
>
> > diff --git a/include/uapi/drm/lima_drm.h b/include/uapi/drm/lima_drm.h
> > new file mode 100644
> > index 000000000000..705723a64b5f
> > --- /dev/null
> > +++ b/include/uapi/drm/lima_drm.h
> > @@ -0,0 +1,139 @@
> > +/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR MIT */
> > +/* Copyright 2017-2018 Qiang Yu <yuq825 at gmail.com> */
> > +
> > +#ifndef __LIMA_DRM_H__
> > +#define __LIMA_DRM_H__
> > +
> > +#include "drm.h"
> > +
> > +#if defined(__cplusplus)
> > +extern "C" {
> > +#endif
> > +
> > +enum drm_lima_param_gpu_id {
> > +     DRM_LIMA_PARAM_GPU_ID_UNKNOWN,
> > +     DRM_LIMA_PARAM_GPU_ID_MALI400,
> > +     DRM_LIMA_PARAM_GPU_ID_MALI450,
> > +};
> > +
> > +enum drm_lima_param {
> > +     DRM_LIMA_PARAM_GPU_ID,
> > +     DRM_LIMA_PARAM_NUM_PP,
> > +     DRM_LIMA_PARAM_GP_VERSION,
> > +     DRM_LIMA_PARAM_PP_VERSION,
> > +};
> > +
> > +struct drm_lima_get_param {
> > +     __u32 param; /* in */
> > +     __u32 pad;
> > +     __u64 value; /* out */
> > +};
> > +
> > +struct drm_lima_gem_create {
> > +     __u32 size;    /* in */
> > +     __u32 flags;   /* in */
> > +     __u32 handle;  /* out */
> > +     __u32 pad;
> > +};
>
> Might be nice to pass the offset back out from create like I did in v3d.
> It's convenient to not need an immediate GEM_INFO.  Totally optional
> suggestion, though.

I considered this, but gem_info can't be removed even if it were embedded
into gem_create, due to the dmabuf import case, and it's better to keep the
info in a single place, so I didn't do it. But indeed an immediate gem_info
is a waste of a syscall.

>
> > +
> > +struct drm_lima_gem_info {
> > +     __u32 handle;  /* in */
> > +     __u32 va;      /* out */
> > +     __u64 offset;  /* out */
> > +};
> > +
> > +#define LIMA_SUBMIT_BO_READ   0x01
> > +#define LIMA_SUBMIT_BO_WRITE  0x02
> > +
> > +struct drm_lima_gem_submit_bo {
> > +     __u32 handle;  /* in */
> > +     __u32 flags;   /* in */
> > +};
> > +
> > +#define LIMA_GP_FRAME_REG_NUM 6
> > +
> > +struct drm_lima_gp_frame {
> > +     __u32 frame[LIMA_GP_FRAME_REG_NUM];
> > +};
> > +
> > +#define LIMA_PP_FRAME_REG_NUM 23
> > +#define LIMA_PP_WB_REG_NUM 12
> > +
> > +struct drm_lima_m400_pp_frame {
> > +     __u32 frame[LIMA_PP_FRAME_REG_NUM];
> > +     __u32 num_pp;
> > +     __u32 wb[3 * LIMA_PP_WB_REG_NUM];
> > +     __u32 plbu_array_address[4];
> > +     __u32 fragment_stack_address[4];
> > +};
> > +
> > +struct drm_lima_m450_pp_frame {
> > +     __u32 frame[LIMA_PP_FRAME_REG_NUM];
> > +     __u32 num_pp;
> > +     __u32 wb[3 * LIMA_PP_WB_REG_NUM];
> > +     __u32 use_dlbu;
> > +     __u32 _pad;
> > +     union {
> > +             __u32 plbu_array_address[8];
> > +             __u32 dlbu_regs[4];
> > +     };
> > +     __u32 fragment_stack_address[8];
> > +};
> > +
> > +#define LIMA_PIPE_GP  0x00
> > +#define LIMA_PIPE_PP  0x01
> > +
> > +#define LIMA_SUBMIT_FLAG_EXPLICIT_FENCE (1 << 0)
> > +
> > +struct drm_lima_gem_submit {
> > +     __u32 ctx;         /* in */
> > +     __u32 pipe;        /* in */
> > +     __u32 nr_bos;      /* in */
> > +     __u32 frame_size;  /* in */
> > +     __u64 bos;         /* in */
> > +     __u64 frame;       /* in */
> > +     __u32 flags;       /* in */
> > +     __u32 out_sync;    /* in */
> > +     __u32 in_sync[2];  /* in */
> > +};
> > +
> > +#define LIMA_GEM_WAIT_READ   0x01
> > +#define LIMA_GEM_WAIT_WRITE  0x02
> > +
> > +struct drm_lima_gem_wait {
> > +     __u32 handle;      /* in */
> > +     __u32 op;          /* in */
> > +     __s64 timeout_ns;  /* in */
>
> Add a comment that it's an absolute ns?
>
> > +};
> > +
> > +struct drm_lima_ctx_create {
> > +     __u32 id;          /* out */
> > +     __u32 _pad;
> > +};
> > +
> > +struct drm_lima_ctx_free {
> > +     __u32 id;          /* in */
> > +     __u32 _pad;
> > +};
>
> I don't think you need the _pad fields here, and they're actually a bad
> idea because the lack of checking in your ioctls means you can't trust
> that userspace has initialized them to 0 when you want to redefine them
> as a flags field later.

Could I drop the _pad fields? I thought there was a rule that all drm ioctl
arg sizes should be kept 64-bit aligned — is there?

Regards,
Qiang


More information about the dri-devel mailing list