[PATCH v5 6/6] drm/syncobj: Add a fast path to drm_syncobj_array_find

Christian König christian.koenig at amd.com
Thu Jun 12 12:02:39 UTC 2025


On 6/12/25 12:58, Tvrtko Ursulin wrote:
> 
> On 12/06/2025 08:21, Christian König wrote:
>> On 6/11/25 17:29, Tvrtko Ursulin wrote:
>>>
>>> On 11/06/2025 15:21, Christian König wrote:
>>>> On 6/11/25 16:00, Tvrtko Ursulin wrote:
>>>>> Running the Cyberpunk 2077 benchmark we can observe that the lookup helper
>>>>> is relatively hot, but the 97% of the calls are for a single object. (~3%
>>>>> for two points, and never more than three points. While a more trivial
>>>>> workload like vkmark under Plasma is even more skewed to single point
>>>>> lookups.)
>>>>>
>>>>> Therefore lets add a fast path to bypass the kmalloc_array/kfree and use a
>>>>> pre-allocated stack array for those cases.
>>>>
>>>> Have you considered using memdup_user()? That's using a separate bucket IIRC and might give similar performance.
>>>
>>> I haven't but I can try it. I would be surprised if it made a (positive) difference though.
>>
>> Yeah, it's mostly for extra security I think.
> 
> On this topic, this discussion prompted me to quickly cook up some trivial cleanups for amdgpu to use memdup_user & co where it was easy. Series is on the mailing list but I did not copy you explicitly giving chance for someone else to notice it and off load you a bit.

Yeah, I know I always wanted to give that task to a student or interim :)

Alex is the one usually picking up amdgpu patches from the mailing list, but I'm happy to add an rb if necessary.

>>> And I realised I need to repeat the benchmarks anyway, since in v4 I had to stop doing access_ok+__get_user, after kernel test robot let me know 64-bit get_user is a not a thing on all platforms. I thought the gains are from avoiding allocations but, as you say, now I need to see if copy_from_user doesn't nullify them..
>>>
>>>> If that is still not sufficient I'm really wondering if we shouldn't have a macro for doing this. It's a really common use case as far as I can see.
>>>
>>> Hmm macro for what exactly?
>>
>> Like a macro which uses an array on the stack for small (<4) number of values and k(v)malloc() for large ones.
>>
>> IIRC there is also a relatively new functionality which allows releasing the memory automatically when we leave the function.
> 
> Okay I will have a look at all those options. But it's going to the bottom of my priority pile so it might be a while.

I'm also perfectly fine with the solution you came up in those patches here for now if that improves the performance at hand.

Just wanted to point out that it is possible that somebody has an use case where X sync_obj handles are asked for timeline fences and that now becomes slower because of that here.

Regards,
Christian.

> 
> Regards,
> 
> Tvrtko
> 
>>>>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at igalia.com>
>>>>> Reviewed-by: Maíra Canal <mcanal at igalia.com>
>>>>> ---
>>>>> v2:
>>>>>    * Added comments describing how the fast path arrays were sized.
>>>>>    * Make container freeing criteria clearer by using a boolean.
>>>>> ---
>>>>>    drivers/gpu/drm/drm_syncobj.c | 56 +++++++++++++++++++++++++++--------
>>>>>    1 file changed, 44 insertions(+), 12 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/drm_syncobj.c
>>>>> index be5905dca87f..65c301852f0d 100644
>>>>> --- a/drivers/gpu/drm/drm_syncobj.c
>>>>> +++ b/drivers/gpu/drm/drm_syncobj.c
>>>>> @@ -1259,6 +1259,8 @@ EXPORT_SYMBOL(drm_timeout_abs_to_jiffies);
>>>>>    static int drm_syncobj_array_find(struct drm_file *file_private,
>>>>>                      u32 __user *handles,
>>>>>                      uint32_t count,
>>>>> +                  struct drm_syncobj **stack_syncobjs,
>>>>> +                  u32 stack_count,
>>>>>                      struct drm_syncobj ***syncobjs_out)
>>>>>    {
>>>>>        struct drm_syncobj **syncobjs;
>>>>> @@ -1268,9 +1270,13 @@ static int drm_syncobj_array_find(struct drm_file *file_private,
>>>>>        if (!access_ok(handles, count * sizeof(*handles)))
>>>>>            return -EFAULT;
>>>>>    -    syncobjs = kmalloc_array(count, sizeof(*syncobjs), GFP_KERNEL);
>>>>> -    if (!syncobjs)
>>>>> -        return -ENOMEM;
>>>>> +    if (count > stack_count) {
>>>>> +        syncobjs = kmalloc_array(count, sizeof(*syncobjs), GFP_KERNEL);
>>>>> +        if (!syncobjs)
>>>>> +            return -ENOMEM;
>>>>> +    } else {
>>>>> +        syncobjs = stack_syncobjs;
>>>>> +    }
>>>>>          for (i = 0; i < count; i++) {
>>>>>            u32 handle;
>>>>> @@ -1292,25 +1298,31 @@ static int drm_syncobj_array_find(struct drm_file *file_private,
>>>>>    err_put_syncobjs:
>>>>>        while (i-- > 0)
>>>>>            drm_syncobj_put(syncobjs[i]);
>>>>> -    kfree(syncobjs);
>>>>> +
>>>>> +    if (syncobjs != stack_syncobjs)
>>>>> +        kfree(syncobjs);
>>>>>          return ret;
>>>>>    }
>>>>>      static void drm_syncobj_array_free(struct drm_syncobj **syncobjs,
>>>>> -                   uint32_t count)
>>>>> +                   uint32_t count,
>>>>> +                   bool free_container)
>>>>>    {
>>>>>        uint32_t i;
>>>>>          for (i = 0; i < count; i++)
>>>>>            drm_syncobj_put(syncobjs[i]);
>>>>> -    kfree(syncobjs);
>>>>> +
>>>>> +    if (free_container)
>>>>> +        kfree(syncobjs);
>>>>>    }
>>>>>      int
>>>>>    drm_syncobj_wait_ioctl(struct drm_device *dev, void *data,
>>>>>                   struct drm_file *file_private)
>>>>>    {
>>>>> +    struct drm_syncobj *stack_syncobjs[DRM_SYNCOBJ_FAST_PATH_ENTRIES];
>>>>>        struct drm_syncobj_wait *args = data;
>>>>>        ktime_t deadline, *pdeadline = NULL;
>>>>>        u32 count = args->count_handles;
>>>>> @@ -1336,6 +1348,8 @@ drm_syncobj_wait_ioctl(struct drm_device *dev, void *data,
>>>>>        ret = drm_syncobj_array_find(file_private,
>>>>>                         u64_to_user_ptr(args->handles),
>>>>>                         count,
>>>>> +                     stack_syncobjs,
>>>>> +                     ARRAY_SIZE(stack_syncobjs),
>>>>>                         &syncobjs);
>>>>>        if (ret < 0)
>>>>>            return ret;
>>>>> @@ -1354,7 +1368,7 @@ drm_syncobj_wait_ioctl(struct drm_device *dev, void *data,
>>>>>                             &first,
>>>>>                             pdeadline);
>>>>>    -    drm_syncobj_array_free(syncobjs, count);
>>>>> +    drm_syncobj_array_free(syncobjs, count, syncobjs != stack_syncobjs);
>>>>>          if (timeout < 0)
>>>>>            return timeout;
>>>>> @@ -1368,6 +1382,7 @@ int
>>>>>    drm_syncobj_timeline_wait_ioctl(struct drm_device *dev, void *data,
>>>>>                    struct drm_file *file_private)
>>>>>    {
>>>>> +    struct drm_syncobj *stack_syncobjs[DRM_SYNCOBJ_FAST_PATH_ENTRIES];
>>>>>        struct drm_syncobj_timeline_wait *args = data;
>>>>>        ktime_t deadline, *pdeadline = NULL;
>>>>>        u32 count = args->count_handles;
>>>>> @@ -1394,6 +1409,8 @@ drm_syncobj_timeline_wait_ioctl(struct drm_device *dev, void *data,
>>>>>        ret = drm_syncobj_array_find(file_private,
>>>>>                         u64_to_user_ptr(args->handles),
>>>>>                         count,
>>>>> +                     stack_syncobjs,
>>>>> +                     ARRAY_SIZE(stack_syncobjs),
>>>>>                         &syncobjs);
>>>>>        if (ret < 0)
>>>>>            return ret;
>>>>> @@ -1412,7 +1429,7 @@ drm_syncobj_timeline_wait_ioctl(struct drm_device *dev, void *data,
>>>>>                             &first,
>>>>>                             pdeadline);
>>>>>    -    drm_syncobj_array_free(syncobjs, count);
>>>>> +    drm_syncobj_array_free(syncobjs, count, syncobjs != stack_syncobjs);
>>>>>          if (timeout < 0)
>>>>>            return timeout;
>>>>> @@ -1529,6 +1546,7 @@ int
>>>>>    drm_syncobj_reset_ioctl(struct drm_device *dev, void *data,
>>>>>                struct drm_file *file_private)
>>>>>    {
>>>>> +    struct drm_syncobj *stack_syncobjs[DRM_SYNCOBJ_FAST_PATH_ENTRIES];
>>>>>        struct drm_syncobj_array *args = data;
>>>>>        struct drm_syncobj **syncobjs;
>>>>>        uint32_t i;
>>>>> @@ -1546,6 +1564,8 @@ drm_syncobj_reset_ioctl(struct drm_device *dev, void *data,
>>>>>        ret = drm_syncobj_array_find(file_private,
>>>>>                         u64_to_user_ptr(args->handles),
>>>>>                         args->count_handles,
>>>>> +                     stack_syncobjs,
>>>>> +                     ARRAY_SIZE(stack_syncobjs),
>>>>>                         &syncobjs);
>>>>>        if (ret < 0)
>>>>>            return ret;
>>>>> @@ -1553,7 +1573,8 @@ drm_syncobj_reset_ioctl(struct drm_device *dev, void *data,
>>>>>        for (i = 0; i < args->count_handles; i++)
>>>>>            drm_syncobj_replace_fence(syncobjs[i], NULL);
>>>>>    -    drm_syncobj_array_free(syncobjs, args->count_handles);
>>>>> +    drm_syncobj_array_free(syncobjs, args->count_handles,
>>>>> +                   syncobjs != stack_syncobjs);
>>>>>          return 0;
>>>>>    }
>>>>> @@ -1562,6 +1583,7 @@ int
>>>>>    drm_syncobj_signal_ioctl(struct drm_device *dev, void *data,
>>>>>                 struct drm_file *file_private)
>>>>>    {
>>>>> +    struct drm_syncobj *stack_syncobjs[DRM_SYNCOBJ_FAST_PATH_ENTRIES];
>>>>>        struct drm_syncobj_array *args = data;
>>>>>        struct drm_syncobj **syncobjs;
>>>>>        uint32_t i;
>>>>> @@ -1579,6 +1601,8 @@ drm_syncobj_signal_ioctl(struct drm_device *dev, void *data,
>>>>>        ret = drm_syncobj_array_find(file_private,
>>>>>                         u64_to_user_ptr(args->handles),
>>>>>                         args->count_handles,
>>>>> +                     stack_syncobjs,
>>>>> +                     ARRAY_SIZE(stack_syncobjs),
>>>>>                         &syncobjs);
>>>>>        if (ret < 0)
>>>>>            return ret;
>>>>> @@ -1589,7 +1613,8 @@ drm_syncobj_signal_ioctl(struct drm_device *dev, void *data,
>>>>>                break;
>>>>>        }
>>>>>    -    drm_syncobj_array_free(syncobjs, args->count_handles);
>>>>> +    drm_syncobj_array_free(syncobjs, args->count_handles,
>>>>> +                   syncobjs != stack_syncobjs);
>>>>>          return ret;
>>>>>    }
>>>>> @@ -1598,6 +1623,7 @@ int
>>>>>    drm_syncobj_timeline_signal_ioctl(struct drm_device *dev, void *data,
>>>>>                      struct drm_file *file_private)
>>>>>    {
>>>>> +    struct drm_syncobj *stack_syncobjs[DRM_SYNCOBJ_FAST_PATH_ENTRIES];
>>>>>        struct drm_syncobj_timeline_array *args = data;
>>>>>        uint64_t __user *points = u64_to_user_ptr(args->points);
>>>>>        uint32_t i, j, count = args->count_handles;
>>>>> @@ -1617,6 +1643,8 @@ drm_syncobj_timeline_signal_ioctl(struct drm_device *dev, void *data,
>>>>>        ret = drm_syncobj_array_find(file_private,
>>>>>                         u64_to_user_ptr(args->handles),
>>>>>                         count,
>>>>> +                     stack_syncobjs,
>>>>> +                     ARRAY_SIZE(stack_syncobjs),
>>>>>                         &syncobjs);
>>>>>        if (ret < 0)
>>>>>            return ret;
>>>>> @@ -1653,7 +1681,7 @@ drm_syncobj_timeline_signal_ioctl(struct drm_device *dev, void *data,
>>>>>    err_chains:
>>>>>        kfree(chains);
>>>>>    out:
>>>>> -    drm_syncobj_array_free(syncobjs, count);
>>>>> +    drm_syncobj_array_free(syncobjs, count, syncobjs != stack_syncobjs);
>>>>>          return ret;
>>>>>    }
>>>>> @@ -1661,6 +1689,7 @@ drm_syncobj_timeline_signal_ioctl(struct drm_device *dev, void *data,
>>>>>    int drm_syncobj_query_ioctl(struct drm_device *dev, void *data,
>>>>>                    struct drm_file *file_private)
>>>>>    {
>>>>> +    struct drm_syncobj *stack_syncobjs[DRM_SYNCOBJ_FAST_PATH_ENTRIES];
>>>>>        struct drm_syncobj_timeline_array *args = data;
>>>>>        struct drm_syncobj **syncobjs;
>>>>>        uint64_t __user *points = u64_to_user_ptr(args->points);
>>>>> @@ -1679,6 +1708,8 @@ int drm_syncobj_query_ioctl(struct drm_device *dev, void *data,
>>>>>        ret = drm_syncobj_array_find(file_private,
>>>>>                         u64_to_user_ptr(args->handles),
>>>>>                         args->count_handles,
>>>>> +                     stack_syncobjs,
>>>>> +                     ARRAY_SIZE(stack_syncobjs),
>>>>>                         &syncobjs);
>>>>>        if (ret < 0)
>>>>>            return ret;
>>>>> @@ -1722,7 +1753,8 @@ int drm_syncobj_query_ioctl(struct drm_device *dev, void *data,
>>>>>            if (ret)
>>>>>                break;
>>>>>        }
>>>>> -    drm_syncobj_array_free(syncobjs, args->count_handles);
>>>>> +    drm_syncobj_array_free(syncobjs, args->count_handles,
>>>>> +                   syncobjs != stack_syncobjs);
>>>>>          return ret;
>>>>>    }
>>>>
>>>
>>
> 



More information about the dri-devel mailing list