[PATCH 4/7] dma-buf: add dma_fence_chain_garbage_collect

Daniel Vetter daniel at ffwll.ch
Fri Jun 11 15:11:53 UTC 2021


On Fri, Jun 11, 2021 at 12:07:00PM +0200, Christian König wrote:
> Am 11.06.21 um 10:58 schrieb Daniel Vetter:
> > On Thu, Jun 10, 2021 at 11:17:57AM +0200, Christian König wrote:
> > > Add some rather sophisticated lockless garbage collection
> > > for dma_fence_chain objects.
> > > 
> > > For this keep all initialized dma_fence_chain nodes an a
> > > queue and trigger garbage collection before a new one is
> > > allocated.
> > > 
> > > Signed-off-by: Christian König <christian.koenig at amd.com>
> > Uh hand-rolled lockless list, I'm not a fan.
> > 
> > But the real question here is why? This is a global list, so it's going to
> > look great on your desktop, but gpus are for servers now and those are
> > NUMA. So just from that pov doing garbage-collection individually still
> > feels like a much better idea.
> 
> Yeah, I was pondering on that quite a bit as well. That why I added the
> multi threaded alloc/free test.
> 
> > So what's the problem your trying to solve here?
> 
> I was not sure if the chain is garbage collected enough when used for the
> dma_resv exclusive object.
> 
> But I think we could just drop this for now and just see how it goes.

Yeah I think fixing a perf/tuning problem when we're hitting it is much
better than trying to guess what it might be and writing something really
clever to anticipate that. We generally get it wrong.

Also with the explaination of what it tests added the testcase makes
sense. Just some documentation or comment would have helped.
-Daniel

> 
> Christian.
> 
> > -Daniel
> > 
> > > ---
> > >   drivers/dma-buf/dma-fence-chain.c | 160 +++++++++++++++++++++++++-----
> > >   include/linux/dma-fence-chain.h   |   5 +
> > >   2 files changed, 142 insertions(+), 23 deletions(-)
> > > 
> > > diff --git a/drivers/dma-buf/dma-fence-chain.c b/drivers/dma-buf/dma-fence-chain.c
> > > index 1b4cb3e5cec9..c2f0b69eabb7 100644
> > > --- a/drivers/dma-buf/dma-fence-chain.c
> > > +++ b/drivers/dma-buf/dma-fence-chain.c
> > > @@ -9,8 +9,53 @@
> > >   #include <linux/dma-fence-chain.h>
> > > +static struct dma_fence_chain __rcu *fifo_front;
> > > +static struct dma_fence_chain __rcu **fifo_back = &fifo_front;
> > > +
> > >   static bool dma_fence_chain_enable_signaling(struct dma_fence *fence);
> > > +/**
> > > + * dma_fence_chain_enqueue - enqeue a chain node for garbage collection
> > > + * @chain: the chain node to enqueue
> > > + *
> > > + * Add the chain node to the end of the gc fifo.
> > > + */
> > > +static void dma_fence_chain_enqueue(struct dma_fence_chain *chain)
> > > +{
> > > +	struct dma_fence_chain __rcu **tmp;
> > > +
> > > +	RCU_INIT_POINTER(chain->next, NULL);
> > > +	tmp = xchg((struct dma_fence_chain __force ***)&fifo_back,
> > > +		   &chain->next);
> > > +
> > > +	/* This is intentionally unordered since we only need the fifo for gc */
> > > +	rcu_assign_pointer(*tmp, chain);
> > > +}
> > > +
> > > +/**
> > > + * dma_fence_chain_dequeue - deqeueue a chain node for garbage collection
> > > + *
> > > + * Remove the first chain node from the gc fifo while making sure that always
> > > + * keep at least one node on the fifo for lockless fifo implementation.
> > > + * Returns the dequeued chain node or NULL.
> > > + */
> > > +static struct dma_fence_chain *dma_fence_chain_dequeue(void)
> > > +{
> > > +	struct dma_fence_chain *chain, *tmp;
> > > +
> > > +	rcu_read_lock();
> > > +	chain = rcu_dereference(fifo_front);
> > > +	/* Never dequeue the last chain node for lockless fifo */
> > > +	if (unlikely(!chain || !rcu_access_pointer(chain->next))) {
> > > +		rcu_read_unlock();
> > > +		return NULL;
> > > +	}
> > > +	tmp = cmpxchg((struct dma_fence_chain __force **)&fifo_front,
> > > +		      chain, rcu_access_pointer(chain->next));
> > > +	rcu_read_unlock();
> > > +	return tmp == chain ? chain : NULL;
> > > +}
> > > +
> > >   /**
> > >    * dma_fence_chain_get_prev - use RCU to get a reference to the previous fence
> > >    * @chain: chain node to get the previous node from
> > > @@ -28,6 +73,43 @@ static struct dma_fence *dma_fence_chain_get_prev(struct dma_fence_chain *chain)
> > >   	return prev;
> > >   }
> > > +/**
> > > + * dma_fence_chain_try_replace - try to replace the prev node
> > > + * @chain: Chain node we try to replace prev.
> > > + * @prev: the old prev node
> > > + *
> > > + * Try to replace the previous chain node when it or its containing fence is
> > > + * signaled. Returns true if we tried, false if we need to wait.
> > > + */
> > > +static bool dma_fence_chain_try_replace(struct dma_fence_chain *chain,
> > > +					struct dma_fence *prev)
> > > +{
> > > +	struct dma_fence *replacement, *tmp;
> > > +	struct dma_fence_chain *prev_chain;
> > > +
> > > +	prev_chain = to_dma_fence_chain(prev);
> > > +	if (prev_chain) {
> > > +		if (!dma_fence_is_signaled(prev_chain->fence))
> > > +			return false;
> > > +
> > > +		replacement = dma_fence_chain_get_prev(prev_chain);
> > > +	} else {
> > > +		if (!dma_fence_is_signaled(prev))
> > > +			return false;
> > > +
> > > +		replacement = NULL;
> > > +	}
> > > +
> > > +	tmp = cmpxchg((struct dma_fence __force **)&chain->prev, prev,
> > > +		      replacement);
> > > +	if (tmp == prev)
> > > +		dma_fence_put(tmp);
> > > +	else
> > > +		dma_fence_put(replacement);
> > > +
> > > +	return true;
> > > +}
> > > +
> > >   /**
> > >    * dma_fence_chain_walk - chain walking function
> > >    * @fence: current chain node
> > > @@ -38,8 +120,8 @@ static struct dma_fence *dma_fence_chain_get_prev(struct dma_fence_chain *chain)
> > >    */
> > >   struct dma_fence *dma_fence_chain_walk(struct dma_fence *fence)
> > >   {
> > > -	struct dma_fence_chain *chain, *prev_chain;
> > > -	struct dma_fence *prev, *replacement, *tmp;
> > > +	struct dma_fence_chain *chain;
> > > +	struct dma_fence *prev;
> > >   	chain = to_dma_fence_chain(fence);
> > >   	if (!chain) {
> > > @@ -48,26 +130,8 @@ struct dma_fence *dma_fence_chain_walk(struct dma_fence *fence)
> > >   	}
> > >   	while ((prev = dma_fence_chain_get_prev(chain))) {
> > > -
> > > -		prev_chain = to_dma_fence_chain(prev);
> > > -		if (prev_chain) {
> > > -			if (!dma_fence_is_signaled(prev_chain->fence))
> > > -				break;
> > > -
> > > -			replacement = dma_fence_chain_get_prev(prev_chain);
> > > -		} else {
> > > -			if (!dma_fence_is_signaled(prev))
> > > -				break;
> > > -
> > > -			replacement = NULL;
> > > -		}
> > > -
> > > -		tmp = cmpxchg((struct dma_fence __force **)&chain->prev,
> > > -			      prev, replacement);
> > > -		if (tmp == prev)
> > > -			dma_fence_put(tmp);
> > > -		else
> > > -			dma_fence_put(replacement);
> > > +		if (!dma_fence_chain_try_replace(chain, prev))
> > > +			break;
> > >   		dma_fence_put(prev);
> > >   	}
> > > @@ -205,7 +269,21 @@ static void dma_fence_chain_release(struct dma_fence *fence)
> > >   	dma_fence_put(prev);
> > >   	dma_fence_put(chain->fence);
> > > -	dma_fence_free(fence);
> > > +	WRITE_ONCE(chain->fence, NULL);
> > > +
> > > +	/*
> > > +	 * Don't garbage collect here to avoid recursion and potential stack
> > > +	 * overflow.
> > > +	 */
> > > +	chain = dma_fence_chain_dequeue();
> > > +	if (!chain)
> > > +		return;
> > > +
> > > +	if (kref_read(&chain->base.refcount) ||
> > > +	    READ_ONCE(chain->fence))
> > > +		dma_fence_chain_enqueue(chain);
> > > +	else
> > > +		dma_fence_free(&chain->base);
> > >   }
> > >   const struct dma_fence_ops dma_fence_chain_ops = {
> > > @@ -218,6 +296,40 @@ const struct dma_fence_ops dma_fence_chain_ops = {
> > >   };
> > >   EXPORT_SYMBOL(dma_fence_chain_ops);
> > > +/**
> > > + * dma_fence_chain_garbage_collect - cleanup chain nodes
> > > + *
> > > + * Do some garbage collection and try to release chain nodes.
> > > + */
> > > +void dma_fence_chain_garbage_collect(void)
> > > +{
> > > +	struct dma_fence_chain *chain = dma_fence_chain_dequeue();
> > > +
> > > +	if (!chain)
> > > +		return;
> > > +
> > > +	if (!dma_fence_get_rcu(&chain->base)) {
> > > +		/* Unused, check if it's also clean */
> > > +		if (likely(!READ_ONCE(chain->fence))) {
> > > +			dma_fence_free(&chain->base);
> > > +			return;
> > > +		}
> > > +
> > > +	} else {
> > > +		struct dma_fence *prev;
> > > +
> > > +		/* Used, do some chain walk */
> > > +		prev = dma_fence_chain_get_prev(chain);
> > > +		if (prev) {
> > > +			dma_fence_chain_try_replace(chain, prev);
> > > +			dma_fence_put(prev);
> > > +		}
> > > +		dma_fence_put(&chain->base);
> > > +	}
> > > +	dma_fence_chain_enqueue(chain);
> > > +}
> > > +EXPORT_SYMBOL(dma_fence_chain_garbage_collect);
> > > +
> > >   /**
> > >    * dma_fence_chain_init - initialize a fence chain
> > >    * @chain: the chain node to initialize
> > > @@ -254,5 +366,7 @@ void dma_fence_chain_init(struct dma_fence_chain *chain,
> > >   	dma_fence_init(&chain->base, &dma_fence_chain_ops,
> > >   		       &chain->lock, context, seqno);
> > > +	dma_fence_chain_enqueue(chain);
> > >   }
> > > +
> > >   EXPORT_SYMBOL(dma_fence_chain_init);
> > > diff --git a/include/linux/dma-fence-chain.h b/include/linux/dma-fence-chain.h
> > > index 5f45689a6d2e..b412b5396434 100644
> > > --- a/include/linux/dma-fence-chain.h
> > > +++ b/include/linux/dma-fence-chain.h
> > > @@ -19,6 +19,7 @@
> > >    * @base: fence base class
> > >    * @lock: spinlock for fence handling
> > >    * @prev: previous fence of the chain
> > > + * @next: next chain node for garbage collection
> > >    * @prev_seqno: original previous seqno before garbage collection
> > >    * @fence: encapsulated fence
> > >    * @cb: callback structure for signaling
> > > @@ -27,6 +28,7 @@
> > >   struct dma_fence_chain {
> > >   	struct dma_fence base;
> > >   	struct dma_fence __rcu *prev;
> > > +	struct dma_fence_chain __rcu *next;
> > >   	u64 prev_seqno;
> > >   	struct dma_fence *fence;
> > >   	union {
> > > @@ -38,6 +40,8 @@ struct dma_fence_chain {
> > >   extern const struct dma_fence_ops dma_fence_chain_ops;
> > > +void dma_fence_chain_garbage_collect(void);
> > > +
> > >   /**
> > >    * to_dma_fence_chain - cast a fence to a dma_fence_chain
> > >    * @fence: fence to cast to a dma_fence_array
> > > @@ -61,6 +65,7 @@ to_dma_fence_chain(struct dma_fence *fence)
> > >    */
> > >   static inline struct dma_fence_chain *dma_fence_chain_alloc(void)
> > >   {
> > > +	dma_fence_chain_garbage_collect();
> > >   	return kmalloc(sizeof(struct dma_fence_chain), GFP_KERNEL);
> > >   };
> > > -- 
> > > 2.25.1
> > > 
> 

-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch


More information about the amd-gfx mailing list