[Intel-xe] [PATCH] drm/xe: Remove unnecessary memory barriers related to wait queue

Matthew Auld matthew.william.auld at gmail.com
Fri Jul 28 12:13:04 UTC 2023


On Thu, 27 Jul 2023 at 19:34, Matthew Brost <matthew.brost at intel.com> wrote:
>
> These are not needed per the wait queue doc, remove them.
>
> Signed-off-by: Matthew Brost <matthew.brost at intel.com>
> ---
>  drivers/gpu/drm/xe/xe_guc_ct.c     | 3 ---
>  drivers/gpu/drm/xe/xe_guc_submit.c | 9 +--------
>  2 files changed, 1 insertion(+), 11 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c
> index cb75db30800c..3f58902b6be3 100644
> --- a/drivers/gpu/drm/xe/xe_guc_ct.c
> +++ b/drivers/gpu/drm/xe/xe_guc_ct.c
> @@ -308,7 +308,6 @@ int xe_guc_ct_enable(struct xe_guc_ct *ct)
>         spin_unlock_irq(&ct->fast_lock);
>         mutex_unlock(&ct->lock);
>
> -       smp_mb();
>         wake_up_all(&ct->wq);
>         drm_dbg(&xe->drm, "GuC CT communication channel enabled\n");
>
> @@ -820,8 +819,6 @@ static int parse_g2h_response(struct xe_guc_ct *ct, u32 *msg, u32 len)
>         g2h_release_space(ct, GUC_CTB_HXG_MSG_MAX_LEN);
>
>         g2h_fence->done = true;
> -       smp_mb();

Hmm, pretty sure we still need barriers here so that done = true is seen
to happen after the stores to fail, error, hint, retry etc., plus a
matching barrier on the reader side. The implicit barriers in
wake_up_all() vs wait_event() won't ensure that ordering. So something
like:

parse_g2h_response():

        /*
         * Pairs with guc_ct_send_recv().
         *
         * The ordering of g2h_fence.done vs wake_up_all() and
         * wait_event_timeout() is already handled by that api. However here
         * we also have the added dependency of ordering the g2h_fence.done
         * after the previous stores, like with g2h_fence.retry,
         * g2h_fence.fail etc., which we need to handle ourselves.
         */
        smp_wmb();
        WRITE_ONCE(g2h_fence->done, true);
        wake_up_all(&ct->g2h_fence_wq);

guc_ct_send_recv():

        ret = wait_event_timeout(ct->g2h_fence_wq,
                                 READ_ONCE(g2h_fence.done), HZ);
        if (!ret) {
        }

        /* Pairs with parse_g2h_response(). */
        smp_rmb();
        if (READ_ONCE(g2h_fence.retry)) {
        }

        if (READ_ONCE(g2h_fence.fail)) {
        }

But AFAICT there also seem to be scary lifetime issues in here. The
g2h_fence is allocated on the stack and goes out of scope when returning
from guc_ct_send_recv(); however, if the wait_event timed out we don't
know whether parse_g2h_response() might still run before we remove the
fence from the xarray. I think we need to grab the ct->mutex to serialize
the two. Also, if we have to acquire ct->mutex anyway, it might be worth
seeing if we can drop the scary shared-memory-accesses-outside-locks
stuff here entirely.
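
Roughly something like this on the timeout path (just a sketch, untested;
I'm going from memory on names like ct->lock, ct->fence_lookup and the
seqno field, and it obviously only buys us anything if the response
parsing side takes the same lock):

        ret = wait_event_timeout(ct->g2h_fence_wq,
                                 READ_ONCE(g2h_fence.done), HZ);
        if (!ret) {
                /*
                 * Serialize against the response handler so it can't still
                 * be poking at the stack allocated fence while (or after)
                 * we drop it from the lookup.
                 */
                mutex_lock(&ct->lock);
                xa_erase(&ct->fence_lookup, g2h_fence.seqno);
                mutex_unlock(&ct->lock);
                return -ETIME;
        }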

> -
>         wake_up_all(&ct->g2h_fence_wq);
>
>         return 0;
> diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
> index 6cb64a097297..d623cd9f6cc4 100644
> --- a/drivers/gpu/drm/xe/xe_guc_submit.c
> +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
> @@ -705,7 +705,6 @@ static void disable_scheduling_deregister(struct xe_guc *guc,
>         int ret;
>
>         set_min_preemption_timeout(guc, e);
> -       smp_rmb();
>         ret = wait_event_timeout(guc->ct.wq, !engine_pending_enable(e) ||
>                                  guc_read_stopped(guc), HZ * 5);
>         if (!ret) {
> @@ -888,7 +887,6 @@ guc_engine_timedout_job(struct drm_sched_job *drm_job)
>                  * error) messages which can cause the schedule disable to get
>                  * lost. If this occurs, trigger a GT reset to recover.
>                  */
> -               smp_rmb();
>                 ret = wait_event_timeout(guc->ct.wq,
>                                          !engine_pending_disable(e) ||
>                                          guc_read_stopped(guc), HZ * 5);
> @@ -1008,7 +1006,6 @@ static void suspend_fence_signal(struct xe_engine *e)
>         XE_BUG_ON(!e->guc->suspend_pending);
>
>         e->guc->suspend_pending = false;

Probably a good idea to use WRITE_ONCE()/READ_ONCE() here. Even if it is
not strictly needed from the compiler's point of view, it at the very
least documents that we are accessing shared memory outside of proper
locking.
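
Something like the below, for example (illustrative only; the reader side
is written from memory and the exact wait condition may well differ):

        /* suspend_fence_signal() */
        WRITE_ONCE(e->guc->suspend_pending, false);
        wake_up(&e->guc->suspend_wait);

        /* and wherever we wait on suspend_wait */
        wait_event(e->guc->suspend_wait,
                   !READ_ONCE(e->guc->suspend_pending));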

> -       smp_wmb();
>         wake_up(&e->guc->suspend_wait);
>  }
>
> @@ -1392,7 +1389,6 @@ int xe_guc_submit_reset_prepare(struct xe_guc *guc)
>          * and releasing any TDRs waiting on guc->submission_state.stopped.
>          */
>         ret = atomic_fetch_or(1, &guc->submission_state.stopped);
> -       smp_wmb();
>         wake_up_all(&guc->ct.wq);
>
>         return ret;
> @@ -1521,17 +1517,14 @@ int xe_guc_sched_done_handler(struct xe_guc *guc, u32 *msg, u32 len)
>         if (engine_pending_enable(e)) {
>                 e->guc->resume_time = ktime_get();
>                 clear_engine_pending_enable(e);
> -               smp_wmb();
>                 wake_up_all(&guc->ct.wq);
>         } else {
>                 clear_engine_pending_disable(e);
>                 if (e->guc->suspend_pending) {
>                         suspend_fence_signal(e);
>                 } else {
> -                       if (engine_banned(e)) {
> -                               smp_wmb();
> +                       if (engine_banned(e))
>                                 wake_up_all(&guc->ct.wq);
> -                       }
>                         deregister_engine(guc, e);
>                 }
>         }
> --
> 2.34.1
>

