[PATCH 1/1] drm/i915/active: Serialize access to barrier tasks lists

Janusz Krzysztofik janusz.krzysztofik at linux.intel.com
Fri Feb 3 20:45:50 UTC 2023


Hi Andi,

Thanks for having a look.

On Friday, 3 February 2023 20:01:40 CET Andi Shyti wrote:
> On Fri, Feb 03, 2023 at 04:52:41PM +0100, Andrzej Hajda wrote:
> > Hi,
> > 
> > On 03.02.2023 14:48, Janusz Krzysztofik wrote:
> > > Barriers are now deleted from a barrier tasks list by temporarily removing
> > > the list content, traversing that content while skipping over the node to
> > > be deleted, then adding the modified content back to the list.  Since that
> > > complex operation is not serialized with other concurrent uses of the
> > > list, functions that depend on the list being either empty or not empty
> > > can make wrong decisions.
> > > 
> > > Protect access to the list during that complex barrier delete operation
> > > with a lock, and teach other users of the list to acquire the lock.  While
> > > this solution may be considered suboptimal, it seems to be the smallest,
> > > least invasive fix.
> > 
> > The idea looks OK, just adding locking to lock-less lists looks somewhat
> > strange :),
> 
> yes :)
> 
> > on the other hand, even llist.h suggests locks in some scenarios, so
> > maybe it's not such a bad idea.
> > 
> > > 
> > > References: https://gitlab.freedesktop.org/drm/intel/-/issues/6333
> > > Signed-off-by: Janusz Krzysztofik <janusz.krzysztofik at linux.intel.com>
> > > ---
> > >   drivers/gpu/drm/i915/gt/intel_engine_cs.c           | 8 ++++++++
> > >   drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c    | 6 +++++-
> > >   drivers/gpu/drm/i915/gt/intel_engine_pm.c           | 6 +++++-
> > >   drivers/gpu/drm/i915/gt/intel_engine_types.h        | 1 +
> > >   drivers/gpu/drm/i915/gt/selftest_engine_heartbeat.c | 5 +++++
> > >   drivers/gpu/drm/i915/i915_active.c                  | 7 +++++++
> > >   6 files changed, 31 insertions(+), 2 deletions(-)
> > > 
> > > diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> > > index d4e29da74612d..b6fb07c188d16 100644
> > > --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> > > +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> > > @@ -1148,6 +1148,7 @@ static int engine_setup_common(struct intel_engine_cs *engine)
> > >   	int err;
> > >   	init_llist_head(&engine->barrier_tasks);
> > > +	spin_lock_init(&engine->barriers_lock);
> > >   	err = init_status_page(engine);
> > >   	if (err)
> > > @@ -1404,6 +1405,8 @@ int intel_engines_init(struct intel_gt *gt)
> > >    */
> > >   void intel_engine_cleanup_common(struct intel_engine_cs *engine)
> > >   {
> > > +	unsigned long flags;
> > > +
> > >   	GEM_BUG_ON(!list_empty(&engine->sched_engine->requests));
> > >   	i915_sched_engine_put(engine->sched_engine);
> > > @@ -1418,7 +1421,9 @@ void intel_engine_cleanup_common(struct intel_engine_cs *engine)
> > >   	if (engine->kernel_context)
> > >   		intel_engine_destroy_pinned_context(engine->kernel_context);
> > > +	spin_lock_irqsave(&engine->barriers_lock, flags);
> > >   	GEM_BUG_ON(!llist_empty(&engine->barrier_tasks));
> > > +	spin_unlock_irqrestore(&engine->barriers_lock, flags);
> > 
> > Since the emptiness check is quite frequent (5 times?), you could add a
> > helper for it.
> 
> I don't think we need spinlocks around llist_empty() at all,
> anyway.

Usually not, but in some cases here, where we want to distinguish between the 
list being really empty and the list being only temporarily empty while it is 
searched for a node to be deleted, we must take the lock to serialize our 
check against __active_del_barrier().  IOW, an empty result seen without the 
lock held is not reliable (a non-empty result is).
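
To illustrate, this is roughly the window that makes an unlocked check 
unreliable (a sketch only, not taken from an actual trace):

	/*
	 * CPU A: ____active_del_barrier()     CPU B: intel_engine_flush_barriers()
	 * ---------------------------------   -------------------------------------
	 * llist_del_all(&engine->barrier_tasks);
	 *                                      if (llist_empty(&engine->barrier_tasks))
	 *                                              return 0;
	 *                                      ^ wrongly decides nothing to flush
	 * llist_add_batch(head, tail,
	 *                 &engine->barrier_tasks);
	 */

With barriers_lock taken on both sides, CPU B sees the list either before the 
delete-all or after the re-add, never in the transient empty state.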

There may be places where that's not really critical, like 
intel_engine_dump(), for example.
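
As for Andrzej's helper suggestion, something along these lines could work (a 
rough sketch on top of the barriers_lock added by this patch; the helper name 
is made up):

	/*
	 * Check list emptiness, serialized against ____active_del_barrier()
	 * temporarily emptying the list.
	 */
	static bool intel_engine_barriers_empty(struct intel_engine_cs *engine)
	{
		unsigned long flags;
		bool empty;

		spin_lock_irqsave(&engine->barriers_lock, flags);
		empty = llist_empty(&engine->barrier_tasks);
		spin_unlock_irqrestore(&engine->barriers_lock, flags);

		return empty;
	}

Then e.g. intel_engine_flush_barriers() could simply do

	if (intel_engine_barriers_empty(engine))
		return 0;

instead of open-coding the locking at each call site.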

Thanks,
Janusz
 
> Andi
> 
> > Regards
> > Andrzej
> > 
> > >   	cleanup_status_page(engine);
> > >   	intel_wa_list_free(&engine->ctx_wa_list);
> > > @@ -2240,6 +2245,7 @@ void intel_engine_dump(struct intel_engine_cs *engine,
> > >   	struct i915_gpu_error * const error = &engine->i915->gpu_error;
> > >   	struct i915_request *rq;
> > >   	intel_wakeref_t wakeref;
> > > +	unsigned long flags;
> > >   	ktime_t dummy;
> > >   	if (header) {
> > > @@ -2254,8 +2260,10 @@ void intel_engine_dump(struct intel_engine_cs *engine,
> > >   		drm_printf(m, "*** WEDGED ***\n");
> > >   	drm_printf(m, "\tAwake? %d\n", atomic_read(&engine->wakeref.count));
> > > +	spin_lock_irqsave(&engine->barriers_lock, flags);
> > >   	drm_printf(m, "\tBarriers?: %s\n",
> > >   		   str_yes_no(!llist_empty(&engine->barrier_tasks)));
> > > +	spin_unlock_irqrestore(&engine->barriers_lock, flags);
> > >   	drm_printf(m, "\tLatency: %luus\n",
> > >   		   ewma__engine_latency_read(&engine->latency));
> > >   	if (intel_engine_supports_stats(engine))
> > > diff --git a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
> > > index 9a527e1f5be65..9c3c8c431f8d8 100644
> > > --- a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
> > > +++ b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
> > > @@ -378,9 +378,13 @@ int intel_engine_flush_barriers(struct intel_engine_cs *engine)
> > >   	struct i915_sched_attr attr = { .priority = I915_PRIORITY_MIN };
> > >   	struct intel_context *ce = engine->kernel_context;
> > >   	struct i915_request *rq;
> > > +	unsigned long flags;
> > >   	int err;
> > > -	if (llist_empty(&engine->barrier_tasks))
> > > +	spin_lock_irqsave(&engine->barriers_lock, flags);
> > > +	err = !llist_empty(&engine->barrier_tasks);
> > > +	spin_unlock_irqrestore(&engine->barriers_lock, flags);
> > > +	if (!err)
> > >   		return 0;
> > >   	if (!intel_engine_pm_get_if_awake(engine))
> > > diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.c b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
> > > index e971b153fda97..708f8907f410b 100644
> > > --- a/drivers/gpu/drm/i915/gt/intel_engine_pm.c
> > > +++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
> > > @@ -235,8 +235,12 @@ static bool switch_to_kernel_context(struct intel_engine_cs *engine)
> > >   static void call_idle_barriers(struct intel_engine_cs *engine)
> > >   {
> > >   	struct llist_node *node, *next;
> > > +	unsigned long flags;
> > > -	llist_for_each_safe(node, next, llist_del_all(&engine->barrier_tasks)) {
> > > +	spin_lock_irqsave(&engine->barriers_lock, flags);
> > > +	node = llist_del_all(&engine->barrier_tasks);
> > > +	spin_unlock_irqrestore(&engine->barriers_lock, flags);
> > > +	llist_for_each_safe(node, next, node) {
> > >   		struct dma_fence_cb *cb =
> > >   			container_of((struct list_head *)node,
> > >   				     typeof(*cb), node);
> > > diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> > > index 4fd54fb8810fb..ab9e0a6de70d4 100644
> > > --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
> > > +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> > > @@ -396,6 +396,7 @@ struct intel_engine_cs {
> > >   	struct intel_context *hung_ce;
> > >   	struct llist_head barrier_tasks;
> > > +	spinlock_t barriers_lock;
> > >   	struct intel_context *kernel_context; /* pinned */
> > > diff --git a/drivers/gpu/drm/i915/gt/selftest_engine_heartbeat.c b/drivers/gpu/drm/i915/gt/selftest_engine_heartbeat.c
> > > index 273d440a53e3f..cbc03662fc693 100644
> > > --- a/drivers/gpu/drm/i915/gt/selftest_engine_heartbeat.c
> > > +++ b/drivers/gpu/drm/i915/gt/selftest_engine_heartbeat.c
> > > @@ -90,6 +90,7 @@ static void pulse_unlock_wait(struct pulse *p)
> > >   static int __live_idle_pulse(struct intel_engine_cs *engine,
> > >   			     int (*fn)(struct intel_engine_cs *cs))
> > >   {
> > > +	unsigned long flags;
> > >   	struct pulse *p;
> > >   	int err;
> > > @@ -113,13 +114,17 @@ static int __live_idle_pulse(struct intel_engine_cs *engine,
> > >   	i915_active_release(&p->active);
> > >   	GEM_BUG_ON(i915_active_is_idle(&p->active));
> > > +	spin_lock_irqsave(&engine->barriers_lock, flags);
> > >   	GEM_BUG_ON(llist_empty(&engine->barrier_tasks));
> > > +	spin_unlock_irqrestore(&engine->barriers_lock, flags);
> > >   	err = fn(engine);
> > >   	if (err)
> > >   		goto out;
> > > +	spin_lock_irqsave(&engine->barriers_lock, flags);
> > >   	GEM_BUG_ON(!llist_empty(&engine->barrier_tasks));
> > > +	spin_unlock_irqrestore(&engine->barriers_lock, flags);
> > >   	if (engine_sync_barrier(engine)) {
> > >   		struct drm_printer m = drm_err_printer("pulse");
> > > diff --git a/drivers/gpu/drm/i915/i915_active.c b/drivers/gpu/drm/i915/i915_active.c
> > > index 7412abf166a8c..320840eea30ce 100644
> > > --- a/drivers/gpu/drm/i915/i915_active.c
> > > +++ b/drivers/gpu/drm/i915/i915_active.c
> > > @@ -370,6 +370,7 @@ static bool ____active_del_barrier(struct i915_active *ref,
> > >   {
> > >   	struct llist_node *head = NULL, *tail = NULL;
> > >   	struct llist_node *pos, *next;
> > > +	unsigned long flags;
> > >   	GEM_BUG_ON(node->timeline != engine->kernel_context->timeline->fence_context);
> > > @@ -388,6 +389,7 @@ static bool ____active_del_barrier(struct i915_active *ref,
> > >   	 * we are actively using the barrier, we know that there will be
> > >   	 * at least another opportunity when we idle.
> > >   	 */
> > > +	spin_lock_irqsave(&engine->barriers_lock, flags);
> > >   	llist_for_each_safe(pos, next, llist_del_all(&engine->barrier_tasks)) {
> > >   		if (node == barrier_from_ll(pos)) {
> > >   			node = NULL;
> > > @@ -401,6 +403,7 @@ static bool ____active_del_barrier(struct i915_active *ref,
> > >   	}
> > >   	if (head)
> > >   		llist_add_batch(head, tail, &engine->barrier_tasks);
> > > +	spin_unlock_irqrestore(&engine->barriers_lock, flags);
> > >   	return !node;
> > >   }
> > > @@ -973,7 +976,9 @@ void i915_active_acquire_barrier(struct i915_active *ref)
> > >   		spin_unlock_irqrestore(&ref->tree_lock, flags);
> > >   		GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
> > > +		spin_lock_irqsave(&engine->barriers_lock, flags);
> > >   		llist_add(barrier_to_ll(node), &engine->barrier_tasks);
> > > +		spin_unlock_irqrestore(&engine->barriers_lock, flags);
> > >   		intel_engine_pm_put_delay(engine, 2);
> > >   	}
> > >   }
> > > @@ -993,7 +998,9 @@ void i915_request_add_active_barriers(struct i915_request *rq)
> > >   	GEM_BUG_ON(intel_engine_is_virtual(engine));
> > >   	GEM_BUG_ON(i915_request_timeline(rq) != engine->kernel_context->timeline);
> > > +	spin_lock_irqsave(&engine->barriers_lock, flags);
> > >   	node = llist_del_all(&engine->barrier_tasks);
> > > +	spin_unlock_irqrestore(&engine->barriers_lock, flags);
> > >   	if (!node)
> > >   		return;
> > >   	/*
> 