[PATCH v5 6/6] drm/syncobj: Add a fast path to drm_syncobj_array_find
Tvrtko Ursulin
tvrtko.ursulin at igalia.com
Thu Jun 12 10:58:15 UTC 2025
On 12/06/2025 08:21, Christian König wrote:
> On 6/11/25 17:29, Tvrtko Ursulin wrote:
>>
>> On 11/06/2025 15:21, Christian König wrote:
>>> On 6/11/25 16:00, Tvrtko Ursulin wrote:
>>>> Running the Cyberpunk 2077 benchmark we can observe that the lookup helper
>>>> is relatively hot, but the 97% of the calls are for a single object. (~3%
>>>> for two points, and never more than three points. While a more trivial
>>>> workload like vkmark under Plasma is even more skewed to single point
>>>> lookups.)
>>>>
>>>> Therefore lets add a fast path to bypass the kmalloc_array/kfree and use a
>>>> pre-allocated stack array for those cases.
>>>
>>> Have you considered using memdup_user()? That's using a separate bucket IIRC and might give similar performance.
>>
>> I haven't but I can try it. I would be surprised if it made a (positive) difference though.
>
> Yeah, it's mostly for extra security I think.
On this topic, this discussion prompted me to quickly cook up some
trivial cleanups for amdgpu to use memdup_user & co where it was easy.
Series is on the mailing list but I did not copy you explicitly giving
chance for someone else to notice it and off load you a bit.
>> And I realised I need to repeat the benchmarks anyway, since in v4 I had to stop doing access_ok+__get_user, after kernel test robot let me know 64-bit get_user is a not a thing on all platforms. I thought the gains are from avoiding allocations but, as you say, now I need to see if copy_from_user doesn't nullify them..
>>
>>> If that is still not sufficient I'm really wondering if we shouldn't have a macro for doing this. It's a really common use case as far as I can see.
>>
>> Hmm macro for what exactly?
>
> Like a macro which uses an array on the stack for small (<4) number of values and k(v)malloc() for large ones.
>
> IIRC there is also a relatively new functionality which allows releasing the memory automatically when we leave the function.
Okay I will have a look at all those options. But it's going to the
bottom of my priority pile so it might be a while.
Regards,
Tvrtko
>>>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at igalia.com>
>>>> Reviewed-by: Maíra Canal <mcanal at igalia.com>
>>>> ---
>>>> v2:
>>>> * Added comments describing how the fast path arrays were sized.
>>>> * Make container freeing criteria clearer by using a boolean.
>>>> ---
>>>> drivers/gpu/drm/drm_syncobj.c | 56 +++++++++++++++++++++++++++--------
>>>> 1 file changed, 44 insertions(+), 12 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/drm_syncobj.c
>>>> index be5905dca87f..65c301852f0d 100644
>>>> --- a/drivers/gpu/drm/drm_syncobj.c
>>>> +++ b/drivers/gpu/drm/drm_syncobj.c
>>>> @@ -1259,6 +1259,8 @@ EXPORT_SYMBOL(drm_timeout_abs_to_jiffies);
>>>> static int drm_syncobj_array_find(struct drm_file *file_private,
>>>> u32 __user *handles,
>>>> uint32_t count,
>>>> + struct drm_syncobj **stack_syncobjs,
>>>> + u32 stack_count,
>>>> struct drm_syncobj ***syncobjs_out)
>>>> {
>>>> struct drm_syncobj **syncobjs;
>>>> @@ -1268,9 +1270,13 @@ static int drm_syncobj_array_find(struct drm_file *file_private,
>>>> if (!access_ok(handles, count * sizeof(*handles)))
>>>> return -EFAULT;
>>>> - syncobjs = kmalloc_array(count, sizeof(*syncobjs), GFP_KERNEL);
>>>> - if (!syncobjs)
>>>> - return -ENOMEM;
>>>> + if (count > stack_count) {
>>>> + syncobjs = kmalloc_array(count, sizeof(*syncobjs), GFP_KERNEL);
>>>> + if (!syncobjs)
>>>> + return -ENOMEM;
>>>> + } else {
>>>> + syncobjs = stack_syncobjs;
>>>> + }
>>>> for (i = 0; i < count; i++) {
>>>> u32 handle;
>>>> @@ -1292,25 +1298,31 @@ static int drm_syncobj_array_find(struct drm_file *file_private,
>>>> err_put_syncobjs:
>>>> while (i-- > 0)
>>>> drm_syncobj_put(syncobjs[i]);
>>>> - kfree(syncobjs);
>>>> +
>>>> + if (syncobjs != stack_syncobjs)
>>>> + kfree(syncobjs);
>>>> return ret;
>>>> }
>>>> static void drm_syncobj_array_free(struct drm_syncobj **syncobjs,
>>>> - uint32_t count)
>>>> + uint32_t count,
>>>> + bool free_container)
>>>> {
>>>> uint32_t i;
>>>> for (i = 0; i < count; i++)
>>>> drm_syncobj_put(syncobjs[i]);
>>>> - kfree(syncobjs);
>>>> +
>>>> + if (free_container)
>>>> + kfree(syncobjs);
>>>> }
>>>> int
>>>> drm_syncobj_wait_ioctl(struct drm_device *dev, void *data,
>>>> struct drm_file *file_private)
>>>> {
>>>> + struct drm_syncobj *stack_syncobjs[DRM_SYNCOBJ_FAST_PATH_ENTRIES];
>>>> struct drm_syncobj_wait *args = data;
>>>> ktime_t deadline, *pdeadline = NULL;
>>>> u32 count = args->count_handles;
>>>> @@ -1336,6 +1348,8 @@ drm_syncobj_wait_ioctl(struct drm_device *dev, void *data,
>>>> ret = drm_syncobj_array_find(file_private,
>>>> u64_to_user_ptr(args->handles),
>>>> count,
>>>> + stack_syncobjs,
>>>> + ARRAY_SIZE(stack_syncobjs),
>>>> &syncobjs);
>>>> if (ret < 0)
>>>> return ret;
>>>> @@ -1354,7 +1368,7 @@ drm_syncobj_wait_ioctl(struct drm_device *dev, void *data,
>>>> &first,
>>>> pdeadline);
>>>> - drm_syncobj_array_free(syncobjs, count);
>>>> + drm_syncobj_array_free(syncobjs, count, syncobjs != stack_syncobjs);
>>>> if (timeout < 0)
>>>> return timeout;
>>>> @@ -1368,6 +1382,7 @@ int
>>>> drm_syncobj_timeline_wait_ioctl(struct drm_device *dev, void *data,
>>>> struct drm_file *file_private)
>>>> {
>>>> + struct drm_syncobj *stack_syncobjs[DRM_SYNCOBJ_FAST_PATH_ENTRIES];
>>>> struct drm_syncobj_timeline_wait *args = data;
>>>> ktime_t deadline, *pdeadline = NULL;
>>>> u32 count = args->count_handles;
>>>> @@ -1394,6 +1409,8 @@ drm_syncobj_timeline_wait_ioctl(struct drm_device *dev, void *data,
>>>> ret = drm_syncobj_array_find(file_private,
>>>> u64_to_user_ptr(args->handles),
>>>> count,
>>>> + stack_syncobjs,
>>>> + ARRAY_SIZE(stack_syncobjs),
>>>> &syncobjs);
>>>> if (ret < 0)
>>>> return ret;
>>>> @@ -1412,7 +1429,7 @@ drm_syncobj_timeline_wait_ioctl(struct drm_device *dev, void *data,
>>>> &first,
>>>> pdeadline);
>>>> - drm_syncobj_array_free(syncobjs, count);
>>>> + drm_syncobj_array_free(syncobjs, count, syncobjs != stack_syncobjs);
>>>> if (timeout < 0)
>>>> return timeout;
>>>> @@ -1529,6 +1546,7 @@ int
>>>> drm_syncobj_reset_ioctl(struct drm_device *dev, void *data,
>>>> struct drm_file *file_private)
>>>> {
>>>> + struct drm_syncobj *stack_syncobjs[DRM_SYNCOBJ_FAST_PATH_ENTRIES];
>>>> struct drm_syncobj_array *args = data;
>>>> struct drm_syncobj **syncobjs;
>>>> uint32_t i;
>>>> @@ -1546,6 +1564,8 @@ drm_syncobj_reset_ioctl(struct drm_device *dev, void *data,
>>>> ret = drm_syncobj_array_find(file_private,
>>>> u64_to_user_ptr(args->handles),
>>>> args->count_handles,
>>>> + stack_syncobjs,
>>>> + ARRAY_SIZE(stack_syncobjs),
>>>> &syncobjs);
>>>> if (ret < 0)
>>>> return ret;
>>>> @@ -1553,7 +1573,8 @@ drm_syncobj_reset_ioctl(struct drm_device *dev, void *data,
>>>> for (i = 0; i < args->count_handles; i++)
>>>> drm_syncobj_replace_fence(syncobjs[i], NULL);
>>>> - drm_syncobj_array_free(syncobjs, args->count_handles);
>>>> + drm_syncobj_array_free(syncobjs, args->count_handles,
>>>> + syncobjs != stack_syncobjs);
>>>> return 0;
>>>> }
>>>> @@ -1562,6 +1583,7 @@ int
>>>> drm_syncobj_signal_ioctl(struct drm_device *dev, void *data,
>>>> struct drm_file *file_private)
>>>> {
>>>> + struct drm_syncobj *stack_syncobjs[DRM_SYNCOBJ_FAST_PATH_ENTRIES];
>>>> struct drm_syncobj_array *args = data;
>>>> struct drm_syncobj **syncobjs;
>>>> uint32_t i;
>>>> @@ -1579,6 +1601,8 @@ drm_syncobj_signal_ioctl(struct drm_device *dev, void *data,
>>>> ret = drm_syncobj_array_find(file_private,
>>>> u64_to_user_ptr(args->handles),
>>>> args->count_handles,
>>>> + stack_syncobjs,
>>>> + ARRAY_SIZE(stack_syncobjs),
>>>> &syncobjs);
>>>> if (ret < 0)
>>>> return ret;
>>>> @@ -1589,7 +1613,8 @@ drm_syncobj_signal_ioctl(struct drm_device *dev, void *data,
>>>> break;
>>>> }
>>>> - drm_syncobj_array_free(syncobjs, args->count_handles);
>>>> + drm_syncobj_array_free(syncobjs, args->count_handles,
>>>> + syncobjs != stack_syncobjs);
>>>> return ret;
>>>> }
>>>> @@ -1598,6 +1623,7 @@ int
>>>> drm_syncobj_timeline_signal_ioctl(struct drm_device *dev, void *data,
>>>> struct drm_file *file_private)
>>>> {
>>>> + struct drm_syncobj *stack_syncobjs[DRM_SYNCOBJ_FAST_PATH_ENTRIES];
>>>> struct drm_syncobj_timeline_array *args = data;
>>>> uint64_t __user *points = u64_to_user_ptr(args->points);
>>>> uint32_t i, j, count = args->count_handles;
>>>> @@ -1617,6 +1643,8 @@ drm_syncobj_timeline_signal_ioctl(struct drm_device *dev, void *data,
>>>> ret = drm_syncobj_array_find(file_private,
>>>> u64_to_user_ptr(args->handles),
>>>> count,
>>>> + stack_syncobjs,
>>>> + ARRAY_SIZE(stack_syncobjs),
>>>> &syncobjs);
>>>> if (ret < 0)
>>>> return ret;
>>>> @@ -1653,7 +1681,7 @@ drm_syncobj_timeline_signal_ioctl(struct drm_device *dev, void *data,
>>>> err_chains:
>>>> kfree(chains);
>>>> out:
>>>> - drm_syncobj_array_free(syncobjs, count);
>>>> + drm_syncobj_array_free(syncobjs, count, syncobjs != stack_syncobjs);
>>>> return ret;
>>>> }
>>>> @@ -1661,6 +1689,7 @@ drm_syncobj_timeline_signal_ioctl(struct drm_device *dev, void *data,
>>>> int drm_syncobj_query_ioctl(struct drm_device *dev, void *data,
>>>> struct drm_file *file_private)
>>>> {
>>>> + struct drm_syncobj *stack_syncobjs[DRM_SYNCOBJ_FAST_PATH_ENTRIES];
>>>> struct drm_syncobj_timeline_array *args = data;
>>>> struct drm_syncobj **syncobjs;
>>>> uint64_t __user *points = u64_to_user_ptr(args->points);
>>>> @@ -1679,6 +1708,8 @@ int drm_syncobj_query_ioctl(struct drm_device *dev, void *data,
>>>> ret = drm_syncobj_array_find(file_private,
>>>> u64_to_user_ptr(args->handles),
>>>> args->count_handles,
>>>> + stack_syncobjs,
>>>> + ARRAY_SIZE(stack_syncobjs),
>>>> &syncobjs);
>>>> if (ret < 0)
>>>> return ret;
>>>> @@ -1722,7 +1753,8 @@ int drm_syncobj_query_ioctl(struct drm_device *dev, void *data,
>>>> if (ret)
>>>> break;
>>>> }
>>>> - drm_syncobj_array_free(syncobjs, args->count_handles);
>>>> + drm_syncobj_array_free(syncobjs, args->count_handles,
>>>> + syncobjs != stack_syncobjs);
>>>> return ret;
>>>> }
>>>
>>
>
More information about the dri-devel
mailing list