[PATCH v2 7/7] drm/syncobj: Add a fast path to drm_syncobj_array_find
Maíra Canal
mcanal at igalia.com
Thu Mar 27 13:45:43 UTC 2025
Hi Tvrtko,
On 27/03/25 05:42, Tvrtko Ursulin wrote:
> Running the Cyberpunk 2077 benchmark we can observe that the lookup helper
> is relatively hot, but 97% of the calls are for a single object (~3% are
> for two points, and never more than three points). A more trivial workload
> like vkmark under Plasma is even more skewed to single point lookups.
>
> Therefore let's add a fast path to bypass the kmalloc_array/kfree and use a
> pre-allocated stack array for those cases.
>
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at igalia.com>
> Reviewed-by: Maíra Canal <mcanal at igalia.com>
> ---
> v2:
> * Added comments describing how the fast path arrays were sized.
> * Make container freeing criteria clearer by using a boolean.
> ---
> drivers/gpu/drm/drm_syncobj.c | 71 ++++++++++++++++++++++++++---------
> 1 file changed, 53 insertions(+), 18 deletions(-)
>
> diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/drm_syncobj.c
> index b906d6acb4ef..d5b99bfea9a5 100644
> --- a/drivers/gpu/drm/drm_syncobj.c
> +++ b/drivers/gpu/drm/drm_syncobj.c
> @@ -236,6 +236,14 @@ static void
> syncobj_eventfd_entry_func(struct drm_syncobj *syncobj,
> struct syncobj_eventfd_entry *entry);
>
> +/*
> + * Empirically, the vast majority of ioctls pass in a single syncobj (96%) and
> + * never more than three points. Therefore implement a fast path with a small
> + * stack array to avoid going into the allocator, sometimes several times per
> + * userspace rendered frame.
> + */
> +#define DRM_SYNCOBJ_FAST_PATH_ENTRIES 4
> +
> /**
> * drm_syncobj_find - lookup and reference a sync object.
> * @file_private: drm file private pointer
> @@ -1035,12 +1043,7 @@ static signed long drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs,
> uint32_t *idx,
> ktime_t *deadline)
> {
> - /*
> - * Empirically vast majority of calls here works with just a single
> - * point (96%) and never more than three points. Therefore a small stack
> - * array can cheaply avoid multiple per frame allocations.
> - */
> - struct syncobj_wait_entry stack_entries[4];
> + struct syncobj_wait_entry stack_entries[DRM_SYNCOBJ_FAST_PATH_ENTRIES];
Could you introduce this change in patch 6/7, to avoid modifying lines that
you introduced earlier in the series?
Best Regards,
- Maíra
> struct syncobj_wait_entry *entries;
> uint32_t signaled_count, i;
> struct dma_fence *fence;
> @@ -1228,6 +1231,8 @@ EXPORT_SYMBOL(drm_timeout_abs_to_jiffies);
> static int drm_syncobj_array_find(struct drm_file *file_private,
> u32 __user *handles,
> uint32_t count,
> + struct drm_syncobj **stack_syncobjs,
> + u32 stack_count,
> struct drm_syncobj ***syncobjs_out)
> {
> struct drm_syncobj **syncobjs;
> @@ -1237,9 +1242,13 @@ static int drm_syncobj_array_find(struct drm_file *file_private,
> if (!access_ok(handles, count * sizeof(*handles)))
> return -EFAULT;
>
> - syncobjs = kmalloc_array(count, sizeof(*syncobjs), GFP_KERNEL);
> - if (!syncobjs)
> - return -ENOMEM;
> + if (count > stack_count) {
> + syncobjs = kmalloc_array(count, sizeof(*syncobjs), GFP_KERNEL);
> + if (!syncobjs)
> + return -ENOMEM;
> + } else {
> + syncobjs = stack_syncobjs;
> + }
>
> for (i = 0; i < count; i++) {
> u32 handle;
> @@ -1261,25 +1270,31 @@ static int drm_syncobj_array_find(struct drm_file *file_private,
> err_put_syncobjs:
> while (i-- > 0)
> drm_syncobj_put(syncobjs[i]);
> - kfree(syncobjs);
> +
> + if (syncobjs != stack_syncobjs)
> + kfree(syncobjs);
>
> return ret;
> }
>
> static void drm_syncobj_array_free(struct drm_syncobj **syncobjs,
> - uint32_t count)
> + uint32_t count,
> + bool free_container)
> {
> uint32_t i;
>
> for (i = 0; i < count; i++)
> drm_syncobj_put(syncobjs[i]);
> - kfree(syncobjs);
> +
> + if (free_container)
> + kfree(syncobjs);
> }
>
> int
> drm_syncobj_wait_ioctl(struct drm_device *dev, void *data,
> struct drm_file *file_private)
> {
> + struct drm_syncobj *stack_syncobjs[DRM_SYNCOBJ_FAST_PATH_ENTRIES];
> struct drm_syncobj_wait *args = data;
> ktime_t deadline, *pdeadline = NULL;
> u32 count = args->count_handles;
> @@ -1305,6 +1320,8 @@ drm_syncobj_wait_ioctl(struct drm_device *dev, void *data,
> ret = drm_syncobj_array_find(file_private,
> u64_to_user_ptr(args->handles),
> count,
> + stack_syncobjs,
> + ARRAY_SIZE(stack_syncobjs),
> &syncobjs);
> if (ret < 0)
> return ret;
> @@ -1323,7 +1340,7 @@ drm_syncobj_wait_ioctl(struct drm_device *dev, void *data,
> &first,
> pdeadline);
>
> - drm_syncobj_array_free(syncobjs, count);
> + drm_syncobj_array_free(syncobjs, count, syncobjs != stack_syncobjs);
>
> if (timeout < 0)
> return timeout;
> @@ -1337,6 +1354,7 @@ int
> drm_syncobj_timeline_wait_ioctl(struct drm_device *dev, void *data,
> struct drm_file *file_private)
> {
> + struct drm_syncobj *stack_syncobjs[DRM_SYNCOBJ_FAST_PATH_ENTRIES];
> struct drm_syncobj_timeline_wait *args = data;
> ktime_t deadline, *pdeadline = NULL;
> u32 count = args->count_handles;
> @@ -1363,6 +1381,8 @@ drm_syncobj_timeline_wait_ioctl(struct drm_device *dev, void *data,
> ret = drm_syncobj_array_find(file_private,
> u64_to_user_ptr(args->handles),
> count,
> + stack_syncobjs,
> + ARRAY_SIZE(stack_syncobjs),
> &syncobjs);
> if (ret < 0)
> return ret;
> @@ -1381,7 +1401,7 @@ drm_syncobj_timeline_wait_ioctl(struct drm_device *dev, void *data,
> &first,
> pdeadline);
>
> - drm_syncobj_array_free(syncobjs, count);
> + drm_syncobj_array_free(syncobjs, count, syncobjs != stack_syncobjs);
>
> if (timeout < 0)
> return timeout;
> @@ -1498,6 +1518,7 @@ int
> drm_syncobj_reset_ioctl(struct drm_device *dev, void *data,
> struct drm_file *file_private)
> {
> + struct drm_syncobj *stack_syncobjs[DRM_SYNCOBJ_FAST_PATH_ENTRIES];
> struct drm_syncobj_array *args = data;
> struct drm_syncobj **syncobjs;
> uint32_t i;
> @@ -1515,6 +1536,8 @@ drm_syncobj_reset_ioctl(struct drm_device *dev, void *data,
> ret = drm_syncobj_array_find(file_private,
> u64_to_user_ptr(args->handles),
> args->count_handles,
> + stack_syncobjs,
> + ARRAY_SIZE(stack_syncobjs),
> &syncobjs);
> if (ret < 0)
> return ret;
> @@ -1522,7 +1545,8 @@ drm_syncobj_reset_ioctl(struct drm_device *dev, void *data,
> for (i = 0; i < args->count_handles; i++)
> drm_syncobj_replace_fence(syncobjs[i], NULL);
>
> - drm_syncobj_array_free(syncobjs, args->count_handles);
> + drm_syncobj_array_free(syncobjs, args->count_handles,
> + syncobjs != stack_syncobjs);
>
> return 0;
> }
> @@ -1531,6 +1555,7 @@ int
> drm_syncobj_signal_ioctl(struct drm_device *dev, void *data,
> struct drm_file *file_private)
> {
> + struct drm_syncobj *stack_syncobjs[DRM_SYNCOBJ_FAST_PATH_ENTRIES];
> struct drm_syncobj_array *args = data;
> struct drm_syncobj **syncobjs;
> uint32_t i;
> @@ -1548,6 +1573,8 @@ drm_syncobj_signal_ioctl(struct drm_device *dev, void *data,
> ret = drm_syncobj_array_find(file_private,
> u64_to_user_ptr(args->handles),
> args->count_handles,
> + stack_syncobjs,
> + ARRAY_SIZE(stack_syncobjs),
> &syncobjs);
> if (ret < 0)
> return ret;
> @@ -1558,7 +1585,8 @@ drm_syncobj_signal_ioctl(struct drm_device *dev, void *data,
> break;
> }
>
> - drm_syncobj_array_free(syncobjs, args->count_handles);
> + drm_syncobj_array_free(syncobjs, args->count_handles,
> + syncobjs != stack_syncobjs);
>
> return ret;
> }
> @@ -1567,6 +1595,7 @@ int
> drm_syncobj_timeline_signal_ioctl(struct drm_device *dev, void *data,
> struct drm_file *file_private)
> {
> + struct drm_syncobj *stack_syncobjs[DRM_SYNCOBJ_FAST_PATH_ENTRIES];
> struct drm_syncobj_timeline_array *args = data;
> uint64_t __user *points = u64_to_user_ptr(args->points);
> uint32_t i, j, count = args->count_handles;
> @@ -1589,6 +1618,8 @@ drm_syncobj_timeline_signal_ioctl(struct drm_device *dev, void *data,
> ret = drm_syncobj_array_find(file_private,
> u64_to_user_ptr(args->handles),
> count,
> + stack_syncobjs,
> + ARRAY_SIZE(stack_syncobjs),
> &syncobjs);
> if (ret < 0)
> return ret;
> @@ -1625,7 +1656,7 @@ drm_syncobj_timeline_signal_ioctl(struct drm_device *dev, void *data,
> err_chains:
> kfree(chains);
> out:
> - drm_syncobj_array_free(syncobjs, count);
> + drm_syncobj_array_free(syncobjs, count, syncobjs != stack_syncobjs);
>
> return ret;
> }
> @@ -1633,6 +1664,7 @@ drm_syncobj_timeline_signal_ioctl(struct drm_device *dev, void *data,
> int drm_syncobj_query_ioctl(struct drm_device *dev, void *data,
> struct drm_file *file_private)
> {
> + struct drm_syncobj *stack_syncobjs[DRM_SYNCOBJ_FAST_PATH_ENTRIES];
> struct drm_syncobj_timeline_array *args = data;
> struct drm_syncobj **syncobjs;
> uint64_t __user *points = u64_to_user_ptr(args->points);
> @@ -1654,6 +1686,8 @@ int drm_syncobj_query_ioctl(struct drm_device *dev, void *data,
> ret = drm_syncobj_array_find(file_private,
> u64_to_user_ptr(args->handles),
> args->count_handles,
> + stack_syncobjs,
> + ARRAY_SIZE(stack_syncobjs),
> &syncobjs);
> if (ret < 0)
> return ret;
> @@ -1697,7 +1731,8 @@ int drm_syncobj_query_ioctl(struct drm_device *dev, void *data,
> break;
> }
> }
> - drm_syncobj_array_free(syncobjs, args->count_handles);
> + drm_syncobj_array_free(syncobjs, args->count_handles,
> + syncobjs != stack_syncobjs);
>
> return ret;
> }
More information about the dri-devel
mailing list