[PATCH 4/7] drm/xe/oa: Signal output fences
Dixit, Ashutosh
ashutosh.dixit at intel.com
Tue Sep 17 22:18:03 UTC 2024
On Fri, 30 Aug 2024 15:16:15 -0700, Ashutosh Dixit wrote:
>
Hi Matt,
> Introduce 'struct xe_oa_fence' which includes the dma_fence used to signal
> output fences in the xe_sync array. The fences are signaled
> asynchronously. When there are no output fences to signal, the OA
> configuration wait is synchronously re-introduced into the ioctl.
>
> v2: Don't wait in the work, use callback + delayed work (Matt B)
> Use a single, not a per-fence spinlock (Matt Brost)
I want to check with you about the "single, not per-fence spinlock" issue
below.
> v3: Move ofence alloc before job submission (Matt)
> Assert, don't fail, from dma_fence_add_callback (Matt)
> Additional dma_fence_get for dma_fence_wait (Matt)
> Change dma_fence_wait to non-interruptible (Matt)
> v4: Introduce last_fence to prevent uaf if stream is closed with
> pending OA config jobs
>
> Suggested-by: Matthew Brost <matthew.brost at intel.com>
> Reviewed-by: Jonathan Cavitt <jonathan.cavitt at intel.com>
> Signed-off-by: Ashutosh Dixit <ashutosh.dixit at intel.com>
> Reviewed-by: Matthew Brost <matthew.brost at intel.com>
> ---
> drivers/gpu/drm/xe/xe_oa.c | 117 ++++++++++++++++++++++++++-----
> drivers/gpu/drm/xe/xe_oa_types.h | 6 ++
> 2 files changed, 106 insertions(+), 17 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_oa.c b/drivers/gpu/drm/xe/xe_oa.c
> index b4b68019d35b7..56f95fd431bf9 100644
> --- a/drivers/gpu/drm/xe/xe_oa.c
> +++ b/drivers/gpu/drm/xe/xe_oa.c
> @@ -100,6 +100,15 @@ struct xe_oa_config_bo {
> struct xe_bb *bb;
> };
>
> +struct xe_oa_fence {
> + /* @base: dma fence base */
> + struct dma_fence base;
> + /* @work: work to signal @base */
> + struct delayed_work work;
> + /* @cb: callback to schedule @work */
> + struct dma_fence_cb cb;
> +};
> +
> #define DRM_FMT(x) DRM_XE_OA_FMT_TYPE_##x
>
> static const struct xe_oa_format oa_formats[] = {
> @@ -172,10 +181,10 @@ static struct xe_oa_config *xe_oa_get_oa_config(struct xe_oa *oa, int metrics_se
> return oa_config;
> }
>
> -static void free_oa_config_bo(struct xe_oa_config_bo *oa_bo)
> +static void free_oa_config_bo(struct xe_oa_config_bo *oa_bo, struct dma_fence *last_fence)
> {
> xe_oa_config_put(oa_bo->oa_config);
> - xe_bb_free(oa_bo->bb, NULL);
> + xe_bb_free(oa_bo->bb, last_fence);
> kfree(oa_bo);
> }
>
> @@ -648,7 +657,8 @@ static void xe_oa_free_configs(struct xe_oa_stream *stream)
>
> xe_oa_config_put(stream->oa_config);
> llist_for_each_entry_safe(oa_bo, tmp, stream->oa_config_bos.first, node)
> - free_oa_config_bo(oa_bo);
> + free_oa_config_bo(oa_bo, stream->last_fence);
> + dma_fence_put(stream->last_fence);
> }
>
> static void xe_oa_store_flex(struct xe_oa_stream *stream, struct xe_lrc *lrc,
> @@ -945,40 +955,112 @@ xe_oa_alloc_config_buffer(struct xe_oa_stream *stream, struct xe_oa_config *oa_c
> return oa_bo;
> }
>
> +static void xe_oa_update_last_fence(struct xe_oa_stream *stream, struct dma_fence *fence)
> +{
> + dma_fence_put(stream->last_fence);
> + stream->last_fence = dma_fence_get(fence);
> +}
> +
> +static void xe_oa_fence_work_fn(struct work_struct *w)
> +{
> + struct xe_oa_fence *ofence = container_of(w, typeof(*ofence), work.work);
> +
> + /* Signal fence to indicate new OA configuration is active */
> + dma_fence_signal(&ofence->base);
> + dma_fence_put(&ofence->base);
> +}
> +
> +static void xe_oa_config_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
> +{
> + /* Additional empirical delay needed for NOA programming after registers are written */
> +#define NOA_PROGRAM_ADDITIONAL_DELAY_US 500
> +
> + struct xe_oa_fence *ofence = container_of(cb, typeof(*ofence), cb);
> +
> + INIT_DELAYED_WORK(&ofence->work, xe_oa_fence_work_fn);
> + queue_delayed_work(system_unbound_wq, &ofence->work,
> + usecs_to_jiffies(NOA_PROGRAM_ADDITIONAL_DELAY_US));
> + dma_fence_put(fence);
> +}
> +
> +static const char *xe_oa_get_driver_name(struct dma_fence *fence)
> +{
> + return "xe_oa";
> +}
> +
> +static const char *xe_oa_get_timeline_name(struct dma_fence *fence)
> +{
> + return "unbound";
> +}
> +
> +static const struct dma_fence_ops xe_oa_fence_ops = {
> + .get_driver_name = xe_oa_get_driver_name,
> + .get_timeline_name = xe_oa_get_timeline_name,
> +};
> +
> static int xe_oa_emit_oa_config(struct xe_oa_stream *stream, struct xe_oa_config *config)
> {
> #define NOA_PROGRAM_ADDITIONAL_DELAY_US 500
> struct xe_oa_config_bo *oa_bo;
> - int err = 0, us = NOA_PROGRAM_ADDITIONAL_DELAY_US;
> + struct xe_oa_fence *ofence;
> + int i, err, num_signal = 0;
> struct dma_fence *fence;
> - long timeout;
>
> - /* Emit OA configuration batch */
> + ofence = kzalloc(sizeof(*ofence), GFP_KERNEL);
> + if (!ofence) {
> + err = -ENOMEM;
> + goto exit;
> + }
> +
> oa_bo = xe_oa_alloc_config_buffer(stream, config);
> if (IS_ERR(oa_bo)) {
> err = PTR_ERR(oa_bo);
> goto exit;
> }
>
> + /* Emit OA configuration batch */
> fence = xe_oa_submit_bb(stream, XE_OA_SUBMIT_ADD_DEPS, oa_bo->bb);
> if (IS_ERR(fence)) {
> err = PTR_ERR(fence);
> goto exit;
> }
>
> - /* Wait till all previous batches have executed */
> - timeout = dma_fence_wait_timeout(fence, false, 5 * HZ);
> - dma_fence_put(fence);
> - if (timeout < 0)
> - err = timeout;
> - else if (!timeout)
> - err = -ETIME;
> - if (err)
> - drm_dbg(&stream->oa->xe->drm, "dma_fence_wait_timeout err %d\n", err);
> + /* Point of no return: initialize and set fence to signal */
> + dma_fence_init(&ofence->base, &xe_oa_fence_ops, &stream->oa_fence_lock, 0, 0);
About stream->oa_fence_lock. This was added in response to this comment
from you in v1 of this patch:
/* No per fence spin lock, global OA one for user_signal */
here:
https://patchwork.freedesktop.org/patch/607602/?series=137058&rev=1#comment_1104327
However, I just realized that, the way it is implemented, there is a
use-after-free (UAF) here: the lock memory can be freed (when the OA
stream is destroyed) before the lock is accessed in dma_fence_signal(),
called from xe_oa_fence_work_fn().
Of course we could implement some reference counting etc. to solve the
use-after-free, but the simplest way to solve it seems to be to go back to
the per-fence lock — unless doing so is incorrect? Looking at
dma_fence_init() callers, I do see per-fence locks being used occasionally
(xe_preempt_fence, vgem_fence, etc.).
So I just want to check with you whether it is OK to go back to the
per-fence lock, or whether we should solve this some other way.
Thanks.
--
Ashutosh
>
> - /* Additional empirical delay needed for NOA programming after registers are written */
> - usleep_range(us, 2 * us);
> + for (i = 0; i < stream->num_syncs; i++) {
> + if (stream->syncs[i].flags & DRM_XE_SYNC_FLAG_SIGNAL)
> + num_signal++;
> + xe_sync_entry_signal(&stream->syncs[i], &ofence->base);
> + }
> +
> + /* Additional dma_fence_get in case we dma_fence_wait */
> + if (!num_signal)
> + dma_fence_get(&ofence->base);
> +
> + /* Update last fence too before adding callback */
> + xe_oa_update_last_fence(stream, fence);
> +
> + /* Add job fence callback to schedule work to signal ofence->base */
> + err = dma_fence_add_callback(fence, &ofence->cb, xe_oa_config_cb);
> + xe_gt_assert(stream->gt, !err || err == -ENOENT);
> + if (err == -ENOENT)
> + xe_oa_config_cb(fence, &ofence->cb);
> +
> + /* If nothing needs to be signaled we wait synchronously */
> + if (!num_signal) {
> + dma_fence_wait(&ofence->base, false);
> + dma_fence_put(&ofence->base);
> + }
> +
> + /* Done with syncs */
> + for (i = 0; i < stream->num_syncs; i++)
> + xe_sync_entry_cleanup(&stream->syncs[i]);
> + kfree(stream->syncs);
> +
> + return 0;
> exit:
> + kfree(ofence);
> return err;
> }
>
> @@ -1479,6 +1561,7 @@ static int xe_oa_stream_init(struct xe_oa_stream *stream,
> goto err_free_oa_buf;
> }
>
> + spin_lock_init(&stream->oa_fence_lock);
> ret = xe_oa_enable_metric_set(stream);
> if (ret) {
> drm_dbg(&stream->oa->xe->drm, "Unable to enable metric set\n");
> diff --git a/drivers/gpu/drm/xe/xe_oa_types.h b/drivers/gpu/drm/xe/xe_oa_types.h
> index 99f4b2d4bdcf6..935f98772a948 100644
> --- a/drivers/gpu/drm/xe/xe_oa_types.h
> +++ b/drivers/gpu/drm/xe/xe_oa_types.h
> @@ -239,6 +239,12 @@ struct xe_oa_stream {
> /** @no_preempt: Whether preemption and timeslicing is disabled for stream exec_q */
> u32 no_preempt;
>
> + /** @last_fence: fence to use in stream destroy when needed */
> + struct dma_fence *last_fence;
> +
> + /** @oa_fence_lock: Lock for struct xe_oa_fence */
> + spinlock_t oa_fence_lock;
> +
> /** @num_syncs: size of @syncs array */
> u32 num_syncs;
>
> --
> 2.41.0
>
More information about the Intel-xe
mailing list