[PATCH] nouveau: rip out fence irq allow/block sequences.
Dave Airlie
airlied at gmail.com
Thu Jan 25 21:55:55 UTC 2024
On Fri, 26 Jan 2024 at 04:28, Daniel Vetter <daniel at ffwll.ch> wrote:
>
> On Tue, Jan 23, 2024 at 05:25:38PM +1000, Dave Airlie wrote:
> > From: Dave Airlie <airlied at redhat.com>
> >
> > fences are signalled on nvidia hw using non-stall interrupts.
> >
> > non-stall interrupts are not latched from my reading.
> >
> > When nouveau emits a fence, it requests a NON_STALL signalling,
> > but it only calls the interface to allow the non-stall irq to happen
> > after it has already emitted the fence. A recent change
> > eacabb546271 ("nouveau: push event block/allowing out of the fence context")
> > made this worse by pushing out the fence allow/block to a workqueue.
> >
> > However I can't see how this could ever work great, since when
> > enable signalling is called, the semaphore has already been emitted
> > to the ring, and the hw could already have tried to set the bits,
> > but it's been masked off. Changing the allowed mask later won't make
> > the interrupt get called again.
> >
> > For now rip all of this out.
> >
> > This fixes a bunch of stalls seen running VK CTS sync tests.
> >
> > Signed-off-by: Dave Airlie <airlied at redhat.com>
> > ---
> > drivers/gpu/drm/nouveau/nouveau_fence.c | 77 +++++--------------------
> > drivers/gpu/drm/nouveau/nouveau_fence.h | 2 -
> > 2 files changed, 16 insertions(+), 63 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.c b/drivers/gpu/drm/nouveau/nouveau_fence.c
> > index 5057d976fa57..d6d50cdccf75 100644
> > --- a/drivers/gpu/drm/nouveau/nouveau_fence.c
> > +++ b/drivers/gpu/drm/nouveau/nouveau_fence.c
> > @@ -50,24 +50,14 @@ nouveau_fctx(struct nouveau_fence *fence)
> > return container_of(fence->base.lock, struct nouveau_fence_chan, lock);
> > }
> >
> > -static int
> > +static void
> > nouveau_fence_signal(struct nouveau_fence *fence)
> > {
> > - int drop = 0;
> > -
> > dma_fence_signal_locked(&fence->base);
> > list_del(&fence->head);
> > rcu_assign_pointer(fence->channel, NULL);
> >
> > - if (test_bit(DMA_FENCE_FLAG_USER_BITS, &fence->base.flags)) {
> > - struct nouveau_fence_chan *fctx = nouveau_fctx(fence);
> > -
> > - if (atomic_dec_and_test(&fctx->notify_ref))
> > - drop = 1;
> > - }
> > -
> > dma_fence_put(&fence->base);
> > - return drop;
> > }
> >
> > static struct nouveau_fence *
> > @@ -93,8 +83,7 @@ nouveau_fence_context_kill(struct nouveau_fence_chan *fctx, int error)
> > if (error)
> > dma_fence_set_error(&fence->base, error);
> >
> > - if (nouveau_fence_signal(fence))
> > - nvif_event_block(&fctx->event);
> > + nouveau_fence_signal(fence);
> > }
> > fctx->killed = 1;
> > spin_unlock_irqrestore(&fctx->lock, flags);
> > @@ -103,8 +92,8 @@ nouveau_fence_context_kill(struct nouveau_fence_chan *fctx, int error)
> > void
> > nouveau_fence_context_del(struct nouveau_fence_chan *fctx)
> > {
> > - cancel_work_sync(&fctx->allow_block_work);
> > nouveau_fence_context_kill(fctx, 0);
> > + nvif_event_block(&fctx->event);
> > nvif_event_dtor(&fctx->event);
> > fctx->dead = 1;
> >
> > @@ -127,11 +116,10 @@ nouveau_fence_context_free(struct nouveau_fence_chan *fctx)
> > kref_put(&fctx->fence_ref, nouveau_fence_context_put);
> > }
> >
> > -static int
> > +static void
> > nouveau_fence_update(struct nouveau_channel *chan, struct nouveau_fence_chan *fctx)
> > {
> > struct nouveau_fence *fence;
> > - int drop = 0;
> > u32 seq = fctx->read(chan);
> >
> > while (!list_empty(&fctx->pending)) {
> > @@ -140,10 +128,8 @@ nouveau_fence_update(struct nouveau_channel *chan, struct nouveau_fence_chan *fc
> > if ((int)(seq - fence->base.seqno) < 0)
> > break;
> >
> > - drop |= nouveau_fence_signal(fence);
> > + nouveau_fence_signal(fence);
> > }
> > -
> > - return drop;
> > }
> >
> > static int
> > @@ -160,26 +146,13 @@ nouveau_fence_wait_uevent_handler(struct nvif_event *event, void *repv, u32 repc
> >
> > fence = list_entry(fctx->pending.next, typeof(*fence), head);
> > chan = rcu_dereference_protected(fence->channel, lockdep_is_held(&fctx->lock));
> > - if (nouveau_fence_update(chan, fctx))
> > - ret = NVIF_EVENT_DROP;
> > + nouveau_fence_update(chan, fctx);
> > }
> > spin_unlock_irqrestore(&fctx->lock, flags);
> >
> > return ret;
> > }
> >
> > -static void
> > -nouveau_fence_work_allow_block(struct work_struct *work)
> > -{
> > - struct nouveau_fence_chan *fctx = container_of(work, struct nouveau_fence_chan,
> > - allow_block_work);
> > -
> > - if (atomic_read(&fctx->notify_ref) == 0)
> > - nvif_event_block(&fctx->event);
> > - else
> > - nvif_event_allow(&fctx->event);
> > -}
> > -
> > void
> > nouveau_fence_context_new(struct nouveau_channel *chan, struct nouveau_fence_chan *fctx)
> > {
> > @@ -191,7 +164,6 @@ nouveau_fence_context_new(struct nouveau_channel *chan, struct nouveau_fence_cha
> > } args;
> > int ret;
> >
> > - INIT_WORK(&fctx->allow_block_work, nouveau_fence_work_allow_block);
> > INIT_LIST_HEAD(&fctx->flip);
> > INIT_LIST_HEAD(&fctx->pending);
> > spin_lock_init(&fctx->lock);
> > @@ -216,6 +188,12 @@ nouveau_fence_context_new(struct nouveau_channel *chan, struct nouveau_fence_cha
> > &args.base, sizeof(args), &fctx->event);
> >
> > WARN_ON(ret);
> > +
> > + /*
> > + * Always allow non-stall irq events - previously this code tried to
> > + * enable/disable them, but that just seems racy as nonstall irqs are unlatched.
> > + */
> > + nvif_event_allow(&fctx->event);
> > }
> >
> > int
> > @@ -247,8 +225,7 @@ nouveau_fence_emit(struct nouveau_fence *fence)
> > return -ENODEV;
> > }
> >
> > - if (nouveau_fence_update(chan, fctx))
> > - nvif_event_block(&fctx->event);
> > + nouveau_fence_update(chan, fctx);
> >
> > list_add_tail(&fence->head, &fctx->pending);
> > spin_unlock_irq(&fctx->lock);
> > @@ -271,8 +248,8 @@ nouveau_fence_done(struct nouveau_fence *fence)
> >
> > spin_lock_irqsave(&fctx->lock, flags);
> > chan = rcu_dereference_protected(fence->channel, lockdep_is_held(&fctx->lock));
> > - if (chan && nouveau_fence_update(chan, fctx))
> > - nvif_event_block(&fctx->event);
> > + if (chan)
> > + nouveau_fence_update(chan, fctx);
> > spin_unlock_irqrestore(&fctx->lock, flags);
> > }
> > return dma_fence_is_signaled(&fence->base);
> > @@ -530,32 +507,10 @@ static const struct dma_fence_ops nouveau_fence_ops_legacy = {
> > .release = nouveau_fence_release
> > };
> >
> > -static bool nouveau_fence_enable_signaling(struct dma_fence *f)
> > -{
> > - struct nouveau_fence *fence = from_fence(f);
> > - struct nouveau_fence_chan *fctx = nouveau_fctx(fence);
> > - bool ret;
> > - bool do_work;
> > -
> > - if (atomic_inc_return(&fctx->notify_ref) == 0)
> > - do_work = true;
> > -
> > - ret = nouveau_fence_no_signaling(f);
> > - if (ret)
> > - set_bit(DMA_FENCE_FLAG_USER_BITS, &fence->base.flags);
> > - else if (atomic_dec_and_test(&fctx->notify_ref))
> > - do_work = true;
> > -
> > - if (do_work)
> > - schedule_work(&fctx->allow_block_work);
> > -
> > - return ret;
> > -}
> > -
> > static const struct dma_fence_ops nouveau_fence_ops_uevent = {
> > .get_driver_name = nouveau_fence_get_get_driver_name,
> > .get_timeline_name = nouveau_fence_get_timeline_name,
> > - .enable_signaling = nouveau_fence_enable_signaling,
> > + .enable_signaling = nouveau_fence_no_signaling,
>
> I think you can rip nouveau_fence_no_signaling out too, it doesn't do
> anything more than what the signalling codepath already does.
>
> But maybe do that in a separate patch, since it might make an existing
> leak more of a sieve. It really should be an existing one, though, since
> you cannot assume that someone external will ever look at whether your
> fence is signalled or not.
> -Sima
>
I think it might be overkill to rip this out, but the fix I put in 6.7
is also having bad side effects, so I'm going to try and revert that
and fix that problem first.
I think I'd like to keep this irq handling stuff as it seems to
matter, but I think the atomic in fctx is wrongly handled: it's a
case of misusing atomics instead of locks. I'm going to spend next
week considering it in a bit more depth.
Dave.
More information about the Nouveau mailing list