[PATCH 1/8] drm/amdgpu: UAPI for user queue management

Shashank Sharma shashank.sharma at amd.com
Tue Feb 7 14:36:11 UTC 2023


On 07/02/2023 15:20, Alex Deucher wrote:
> On Tue, Feb 7, 2023 at 9:19 AM Christian König <christian.koenig at amd.com> wrote:
>> Am 07.02.23 um 15:17 schrieb Alex Deucher:
>>> On Tue, Feb 7, 2023 at 9:11 AM Christian König
>>> <ckoenig.leichtzumerken at gmail.com> wrote:
>>>> Am 07.02.23 um 15:07 schrieb Alex Deucher:
>>>>> On Tue, Feb 7, 2023 at 2:38 AM Shashank Sharma <shashank.sharma at amd.com> wrote:
>>>>>> On 07/02/2023 08:03, Christian König wrote:
>>>>>>> Am 06.02.23 um 22:03 schrieb Alex Deucher:
>>>>>>>> On Mon, Feb 6, 2023 at 12:01 PM Christian König
>>>>>>>> <christian.koenig at amd.com> wrote:
>>>>>>>>> Am 06.02.23 um 17:56 schrieb Alex Deucher:
>>>>>>>>>> On Fri, Feb 3, 2023 at 5:26 PM Shashank Sharma
>>>>>>>>>> <shashank.sharma at amd.com> wrote:
>>>>>>>>>>> Hey Alex,
>>>>>>>>>>>
>>>>>>>>>>> On 03/02/2023 23:07, Alex Deucher wrote:
>>>>>>>>>>>> On Fri, Feb 3, 2023 at 4:54 PM Shashank Sharma
>>>>>>>>>>>> <shashank.sharma at amd.com> wrote:
>>>>>>>>>>>>> From: Alex Deucher <alexander.deucher at amd.com>
>>>>>>>>>>>>>
>>>>>>>>>>>>> This patch introduces a new UAPI/IOCTL for usermode graphics
>>>>>>>>>>>>> queues. The userspace app will fill this structure and request
>>>>>>>>>>>>> the graphics driver to add a graphics work queue for it. The
>>>>>>>>>>>>> output of this UAPI is a queue id.
>>>>>>>>>>>>>
>>>>>>>>>>>>> This UAPI maps the queue into the GPU, so the graphics app can
>>>>>>>>>>>>> start submitting work to the queue as soon as the call returns.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Cc: Alex Deucher <alexander.deucher at amd.com>
>>>>>>>>>>>>> Cc: Christian Koenig <christian.koenig at amd.com>
>>>>>>>>>>>>> Signed-off-by: Alex Deucher <alexander.deucher at amd.com>
>>>>>>>>>>>>> Signed-off-by: Shashank Sharma <shashank.sharma at amd.com>
>>>>>>>>>>>>> ---
>>>>>>>>>>>>>        include/uapi/drm/amdgpu_drm.h | 53 +++++++++++++++++++++++++++++++++++
>>>>>>>>>>>>>        1 file changed, 53 insertions(+)
>>>>>>>>>>>>>
>>>>>>>>>>>>> diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
>>>>>>>>>>>>> index 4038abe8505a..6c5235d107b3 100644
>>>>>>>>>>>>> --- a/include/uapi/drm/amdgpu_drm.h
>>>>>>>>>>>>> +++ b/include/uapi/drm/amdgpu_drm.h
>>>>>>>>>>>>> @@ -54,6 +54,7 @@ extern "C" {
>>>>>>>>>>>>>        #define DRM_AMDGPU_VM                  0x13
>>>>>>>>>>>>>        #define DRM_AMDGPU_FENCE_TO_HANDLE     0x14
>>>>>>>>>>>>>        #define DRM_AMDGPU_SCHED               0x15
>>>>>>>>>>>>> +#define DRM_AMDGPU_USERQ               0x16
>>>>>>>>>>>>>
>>>>>>>>>>>>>        #define DRM_IOCTL_AMDGPU_GEM_CREATE DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_CREATE, union drm_amdgpu_gem_create)
>>>>>>>>>>>>>        #define DRM_IOCTL_AMDGPU_GEM_MMAP DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_MMAP, union drm_amdgpu_gem_mmap)
>>>>>>>>>>>>> @@ -71,6 +72,7 @@ extern "C" {
>>>>>>>>>>>>>        #define DRM_IOCTL_AMDGPU_VM DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_VM, union drm_amdgpu_vm)
>>>>>>>>>>>>>        #define DRM_IOCTL_AMDGPU_FENCE_TO_HANDLE DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_FENCE_TO_HANDLE, union drm_amdgpu_fence_to_handle)
>>>>>>>>>>>>>        #define DRM_IOCTL_AMDGPU_SCHED DRM_IOW(DRM_COMMAND_BASE + DRM_AMDGPU_SCHED, union drm_amdgpu_sched)
>>>>>>>>>>>>> +#define DRM_IOCTL_AMDGPU_USERQ DRM_IOW(DRM_COMMAND_BASE + DRM_AMDGPU_USERQ, union drm_amdgpu_userq)
>>>>>>>>>>>>>
>>>>>>>>>>>>>        /**
>>>>>>>>>>>>>         * DOC: memory domains
>>>>>>>>>>>>> @@ -302,6 +304,57 @@ union drm_amdgpu_ctx {
>>>>>>>>>>>>>               union drm_amdgpu_ctx_out out;
>>>>>>>>>>>>>        };
>>>>>>>>>>>>>
>>>>>>>>>>>>> +/* user queue IOCTL */
>>>>>>>>>>>>> +#define AMDGPU_USERQ_OP_CREATE 1
>>>>>>>>>>>>> +#define AMDGPU_USERQ_OP_FREE   2
>>>>>>>>>>>>> +
>>>>>>>>>>>>> +#define AMDGPU_USERQ_MQD_FLAGS_SECURE  (1 << 0)
>>>>>>>>>>>>> +#define AMDGPU_USERQ_MQD_FLAGS_AQL     (1 << 1)
>>>>>>>>>>>>> +
>>>>>>>>>>>>> +struct drm_amdgpu_userq_mqd {
>>>>>>>>>>>>> +       /** Flags: AMDGPU_USERQ_MQD_FLAGS_* */
>>>>>>>>>>>>> +       __u32   flags;
>>>>>>>>>>>>> +       /** IP type: AMDGPU_HW_IP_* */
>>>>>>>>>>>>> +       __u32   ip_type;
>>>>>>>>>>>>> +       /** GEM object handle */
>>>>>>>>>>>>> +       __u32   doorbell_handle;
>>>>>>>>>>>>> +       /** Doorbell offset in dwords */
>>>>>>>>>>>>> +       __u32   doorbell_offset;
>>>>>>>>>>>> Since doorbells are 64 bit, maybe this offset should be in qwords.
>>>>>>>>>>> Can you please help cross-check this information? All the
>>>>>>>>>>> existing kernel doorbell calculations treat the doorbell
>>>>>>>>>>> size as sizeof(u32).
>>>>>>>>>> Doorbells on pre-vega hardware are 32 bits so that is where that comes
>>>>>>>>>> from, but from vega onward most doorbells are 64 bit.  I think some
>>>>>>>>>> versions of VCN may still use 32 bit doorbells.  Internally in the
>>>>>>>>>> kernel driver we just use two slots for newer hardware, but for the
>>>>>>>>>> UAPI, I think we can just stick with 64 bit slots to avoid confusion.
>>>>>>>>>> Even if an engine only uses a 32 bit one, I don't know that there
>>>>>>>>>> is much value in trying to support variable doorbell sizes.
>>>>>>>>> I think we can stick with using __u32 because this is *not* the size of
>>>>>>>>> the doorbell entries.
>>>>>>>>>
>>>>>>>>> Instead this is the offset into the BO at which to find the doorbell
>>>>>>>>> for this queue (which in turn is 64 bits wide).
>>>>>>>>>
>>>>>>>>> Since we will probably never have more than 4GiB of doorbells we
>>>>>>>>> should be pretty safe using 32 bits here.
>>>>>>>> Yes, the offset would still be 32 bits, but the units would be
>>>>>>>> qwords.  E.g.,
>>>>>>>>
>>>>>>>> +       /** Doorbell offset in qwords */
>>>>>>>> +       __u32   doorbell_offset;
>>>>>>>>
>>>>>>>> That way you couldn't accidentally specify an overlapping doorbell.
>>>>>>> Ah, so you only wanted to fix the comment. That was absolutely not
>>>>>>> clear from the discussion.
>>>>>> If I understand this correctly, the offset of the doorbell in the BO
>>>>>> is still 32-bit, but its width (size in bytes) is 64 bits. Am I
>>>>>> getting that right?
>>>>> Right.  Each doorbell is 64 bits (8 bytes), so this value would
>>>>> basically be an index into the doorbell BO.  Having it be a 64 bit
>>>>> index rather than a 32 bit index would avoid the possibility of users
>>>>> specifying overlapping doorbells.  E.g., with offsets in bytes:
>>>>>
>>>>> 0 - doorbell
>>>>> 4 - doorbell
>>>>>
>>>>> would be incorrect, while
>>>>>
>>>>> 0 - doorbell
>>>>> 8 - doorbell
>>>>>
>>>>> would be correct.
>>>>>
>>>>> I.e., u64 doorbell_page[512] vs. u32 doorbell_page[1024].
>>>> Well I usually prefer just straight byte offsets, but I think the main
>>>> question is what does the underlying hw/fw use?
>>>>
>>>> If that's a dword index we should probably stick with that in the UAPI
>>>> as well. If it's a qword index then stick to that, and if it's in bytes
>>>> then use that.
>>> The MQD takes a dword offset from the start of the BAR, but the
>>> doorbell is 64 bits wide, so we have to be careful to check for
>>> overlapping doorbells.
>> Well then let's just add an "if (doorbell_idx & 0x1) return -EINVAL;" to
>> the kernel instead.
>>
>> That's far less confusing than having dwords in the MQD and qwords in
>> the UAPI.
> Yes, agreed.

Got it, Thanks.
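
For reference, here is a minimal sketch of the check we agreed on; the
helper name and everything around it are hypothetical, only the "& 0x1"
test itself comes from this thread:

/* Hypothetical kernel-side helper, not part of this patch.
 * doorbell_offset is a dword index into the doorbell BO, while each
 * doorbell entry is a qword (8 bytes), so an odd dword index would
 * overlap the neighbouring doorbell entry.
 */
static int amdgpu_userq_validate_doorbell(u32 doorbell_offset)
{
        if (doorbell_offset & 0x1)
                return -EINVAL;

        return 0;
}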

- Shashank
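
PS: For completeness, a rough userspace sketch against the proposed UAPI.
The struct fields, defines and the q_id output are taken from the patch
above; the helper itself, its parameters and the use of
drmCommandWriteRead() are illustrative assumptions:

#include <string.h>
#include <xf86drm.h>
#include "amdgpu_drm.h"

/* Hypothetical example: create a GFX user queue and return its id. */
static int example_create_gfx_userq(int fd, __u32 doorbell_handle,
                                    __u32 doorbell_offset_dw,
                                    __u64 queue_va, __u64 queue_size,
                                    __u64 rptr_va, __u64 wptr_va,
                                    __u32 *queue_id)
{
        union drm_amdgpu_userq args;
        int r;

        memset(&args, 0, sizeof(args));
        args.in.op = AMDGPU_USERQ_OP_CREATE;
        args.in.mqd.ip_type = AMDGPU_HW_IP_GFX;
        args.in.mqd.doorbell_handle = doorbell_handle;
        /* dword offset into the doorbell BO; per the discussion above,
         * this must be even (qword aligned) or the kernel returns
         * -EINVAL. */
        args.in.mqd.doorbell_offset = doorbell_offset_dw;
        args.in.mqd.queue_va = queue_va;
        args.in.mqd.queue_size = queue_size;
        args.in.mqd.rptr_va = rptr_va;
        args.in.mqd.wptr_va = wptr_va;

        r = drmCommandWriteRead(fd, DRM_AMDGPU_USERQ, &args, sizeof(args));
        if (r)
                return r;

        *queue_id = args.out.q_id;
        return 0;
}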

>
> Alex
>
>> Christian.
>>
>>> Alex
>>>
>>>> Otherwise we will just confuse people when we convert between the
>>>> different API levels.
>>>>
>>>> Christian.
>>>>
>>>>> Alex
>>>>>
>>>>>> - Shashank
>>>>>>
>>>>>>> Christian.
>>>>>>>
>>>>>>>> Alex
>>>>>>>>
>>>>>>>>> Christian.
>>>>>>>>>
>>>>>>>>>> Alex
>>>>>>>>>>
>>>>>>>>>>>>> +       /** GPU virtual address of the queue */
>>>>>>>>>>>>> +       __u64   queue_va;
>>>>>>>>>>>>> +       /** Size of the queue in bytes */
>>>>>>>>>>>>> +       __u64   queue_size;
>>>>>>>>>>>>> +       /** GPU virtual address of the rptr */
>>>>>>>>>>>>> +       __u64   rptr_va;
>>>>>>>>>>>>> +       /** GPU virtual address of the wptr */
>>>>>>>>>>>>> +       __u64   wptr_va;
>>>>>>>>>>>>> +};
>>>>>>>>>>>>> +
>>>>>>>>>>>>> +struct drm_amdgpu_userq_in {
>>>>>>>>>>>>> +       /** AMDGPU_USERQ_OP_* */
>>>>>>>>>>>>> +       __u32   op;
>>>>>>>>>>>>> +       /** Flags */
>>>>>>>>>>>>> +       __u32   flags;
>>>>>>>>>>>>> +       /** Queue handle to associate the queue free call with,
>>>>>>>>>>>>> +        * unused for queue create calls */
>>>>>>>>>>>>> +       __u32   queue_id;
>>>>>>>>>>>>> +       __u32   pad;
>>>>>>>>>>>>> +       /** Queue descriptor */
>>>>>>>>>>>>> +       struct drm_amdgpu_userq_mqd mqd;
>>>>>>>>>>>>> +};
>>>>>>>>>>>>> +
>>>>>>>>>>>>> +struct drm_amdgpu_userq_out {
>>>>>>>>>>>>> +       /** Queue handle */
>>>>>>>>>>>>> +       __u32   q_id;
>>>>>>>>>>>> Maybe this should be queue_id to match the input.
>>>>>>>>>>> Agree.
>>>>>>>>>>>
>>>>>>>>>>> - Shashank
>>>>>>>>>>>
>>>>>>>>>>>> Alex
>>>>>>>>>>>>
>>>>>>>>>>>>> +       /** Flags */
>>>>>>>>>>>>> +       __u32   flags;
>>>>>>>>>>>>> +};
>>>>>>>>>>>>> +
>>>>>>>>>>>>> +union drm_amdgpu_userq {
>>>>>>>>>>>>> +       struct drm_amdgpu_userq_in in;
>>>>>>>>>>>>> +       struct drm_amdgpu_userq_out out;
>>>>>>>>>>>>> +};
>>>>>>>>>>>>> +
>>>>>>>>>>>>>        /* vm ioctl */
>>>>>>>>>>>>>        #define AMDGPU_VM_OP_RESERVE_VMID      1
>>>>>>>>>>>>>        #define AMDGPU_VM_OP_UNRESERVE_VMID    2
>>>>>>>>>>>>> --
>>>>>>>>>>>>> 2.34.1
>>>>>>>>>>>>>

