[PATCH 03/18] drm/xe/eudebug: Introduce discovery for resources

Matthew Brost matthew.brost at intel.com
Thu Oct 3 16:32:17 UTC 2024


On Thu, Oct 03, 2024 at 10:27:52AM +0300, Mika Kuoppala wrote:
> Matthew Brost <matthew.brost at intel.com> writes:
> 
> > On Tue, Oct 01, 2024 at 05:42:51PM +0300, Mika Kuoppala wrote:
> >> Debugger connection can happen way after the client has
> >> created and destroyed arbitrary number of resources.
> >> 
> >> We need to playback all currently existing resources for the
> >> debugger. The client is held until this so called discovery
> >> process, executed by workqueue, is complete.
> >> 
> >> This patch is based on discovery work by Maciej Patelczyk
> >> for i915 driver.
> >> 
> >> v2: - use rw_semaphore to block drm_ioctls during discovery (Matthew)
> >>     - only lock according to ioctl at play (Dominik)
> >> 
> >> Cc: Matthew Brost <matthew.brost at intel.com>
> >> Cc: Dominik Grzegorzek <dominik.grzegorzek at intel.com>
> >> Co-developed-by: Maciej Patelczyk <maciej.patelczyk at intel.com>
> >> Signed-off-by: Maciej Patelczyk <maciej.patelczyk at intel.com>
> >> Signed-off-by: Mika Kuoppala <mika.kuoppala at linux.intel.com>
> >> ---
> >>  drivers/gpu/drm/xe/xe_device.c        |  10 +-
> >>  drivers/gpu/drm/xe/xe_device.h        |  34 +++++++
> >>  drivers/gpu/drm/xe/xe_device_types.h  |   6 ++
> >>  drivers/gpu/drm/xe/xe_eudebug.c       | 135 +++++++++++++++++++++++++-
> >>  drivers/gpu/drm/xe/xe_eudebug_types.h |   7 ++
> >>  5 files changed, 185 insertions(+), 7 deletions(-)
> >> 
> >> diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
> >> index 5615e2c23bf6..ec5eedbbf320 100644
> >> --- a/drivers/gpu/drm/xe/xe_device.c
> >> +++ b/drivers/gpu/drm/xe/xe_device.c
> >> @@ -215,8 +215,11 @@ static long xe_drm_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
> >>  		return -ECANCELED;
> >>  
> >>  	ret = xe_pm_runtime_get_ioctl(xe);
> >> -	if (ret >= 0)
> >> +	if (ret >= 0) {
> >> +		xe_eudebug_discovery_lock(xe, cmd);
> >>  		ret = drm_ioctl(file, cmd, arg);
> >> +		xe_eudebug_discovery_unlock(xe, cmd);
> >> +	}
> >>  	xe_pm_runtime_put(xe);
> >>  
> >>  	return ret;
> >> @@ -233,8 +236,11 @@ static long xe_drm_compat_ioctl(struct file *file, unsigned int cmd, unsigned lo
> >>  		return -ECANCELED;
> >>  
> >>  	ret = xe_pm_runtime_get_ioctl(xe);
> >> -	if (ret >= 0)
> >> +	if (ret >= 0) {
> >> +		xe_eudebug_discovery_lock(xe, cmd);
> >>  		ret = drm_compat_ioctl(file, cmd, arg);
> >> +		xe_eudebug_discovery_unlock(xe, cmd);
> >> +	}
> >>  	xe_pm_runtime_put(xe);
> >>  
> >>  	return ret;
> >> diff --git a/drivers/gpu/drm/xe/xe_device.h b/drivers/gpu/drm/xe/xe_device.h
> >> index 4c3f0ebe78a9..b2fc85b1d26e 100644
> >> --- a/drivers/gpu/drm/xe/xe_device.h
> >> +++ b/drivers/gpu/drm/xe/xe_device.h
> >> @@ -7,6 +7,7 @@
> >>  #define _XE_DEVICE_H_
> >>  
> >>  #include <drm/drm_util.h>
> >> +#include <drm/drm_ioctl.h>
> >>  
> >>  #include "xe_device_types.h"
> >>  #include "xe_gt_types.h"
> >> @@ -191,4 +192,37 @@ void xe_device_declare_wedged(struct xe_device *xe);
> >>  struct xe_file *xe_file_get(struct xe_file *xef);
> >>  void xe_file_put(struct xe_file *xef);
> >>  
> >> +#if IS_ENABLED(CONFIG_DRM_XE_EUDEBUG)
> >> +static inline int xe_eudebug_needs_lock(const unsigned int cmd)
> >> +{
> >> +	const unsigned int xe_cmd = DRM_IOCTL_NR(cmd) - DRM_COMMAND_BASE;
> >> +
> >> +	switch (xe_cmd) {
> >> +	case DRM_XE_VM_CREATE:
> >> +	case DRM_XE_VM_DESTROY:
> >> +	case DRM_XE_VM_BIND:
> >> +	case DRM_XE_EXEC_QUEUE_CREATE:
> >> +	case DRM_XE_EXEC_QUEUE_DESTROY:
> >> +	case DRM_XE_EUDEBUG_CONNECT:
> >> +		return 1;
> >> +	}
> >> +
> >> +	return 0;
> >> +}
> >> +
> >> +static inline void xe_eudebug_discovery_lock(struct xe_device *xe, unsigned int cmd)
> >> +{
> >> +	if (xe_eudebug_needs_lock(cmd))
> >> +		down_read(&xe->eudebug.discovery_lock);
> >> +}
> >> +static inline void xe_eudebug_discovery_unlock(struct xe_device *xe, unsigned int cmd)
> >> +{
> >> +	if (xe_eudebug_needs_lock(cmd))
> >> +		up_read(&xe->eudebug.discovery_lock);
> >> +}
> >> +#else
> >> +static inline void xe_eudebug_discovery_lock(struct xe_device *xe, unsigned int cmd) { }
> >> +static inline void xe_eudebug_discovery_unlock(struct xe_device *xe, unsigned int cmd) { }
> >> +#endif /* CONFIG_DRM_XE_EUDEBUG */
> >> +
> >>  #endif
> >> diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
> >> index cb4b52888a4b..54ceeee7cf75 100644
> >> --- a/drivers/gpu/drm/xe/xe_device_types.h
> >> +++ b/drivers/gpu/drm/xe/xe_device_types.h
> >> @@ -542,6 +542,12 @@ struct xe_device {
> >>  
> >>  		/** @available: is the debugging functionality available */
> >>  		bool available;
> >> +
> >> +		/** @ordered_wq: used to discovery */
> >> +		struct workqueue_struct *ordered_wq;
> >> +
> >> +		/** discovery_lock: used for discovery to block xe ioctls */
> >> +		struct rw_semaphore discovery_lock;
> >>  	} eudebug;
> >>  #endif
> >>  
> >> diff --git a/drivers/gpu/drm/xe/xe_eudebug.c b/drivers/gpu/drm/xe/xe_eudebug.c
> >> index ea0cfd7697aa..c294e2c6152b 100644
> >> --- a/drivers/gpu/drm/xe/xe_eudebug.c
> >> +++ b/drivers/gpu/drm/xe/xe_eudebug.c
> >> @@ -299,6 +299,8 @@ static bool xe_eudebug_detach(struct xe_device *xe,
> >>  	}
> >>  	spin_unlock(&d->connection.lock);
> >>  
> >> +	flush_work(&d->discovery_work);
> >> +
> >>  	if (!detached)
> >>  		return false;
> >>  
> >> @@ -409,7 +411,7 @@ static struct task_struct *find_task_get(struct xe_file *xef)
> >>  }
> >>  
> >>  static struct xe_eudebug *
> >> -xe_eudebug_get(struct xe_file *xef)
> >> +_xe_eudebug_get(struct xe_file *xef)
> >>  {
> >>  	struct task_struct *task;
> >>  	struct xe_eudebug *d;
> >> @@ -433,6 +435,24 @@ xe_eudebug_get(struct xe_file *xef)
> >>  	return d;
> >>  }
> >>  
> >> +static struct xe_eudebug *
> >> +xe_eudebug_get(struct xe_file *xef)
> >> +{
> >> +	struct xe_eudebug *d;
> >> +
> >> +	lockdep_assert_held(&xef->xe->eudebug.discovery_lock);
> >> +
> >> +	d = _xe_eudebug_get(xef);
> >> +	if (d) {
> >> +		if (!completion_done(&d->discovery)) {
> >> +			xe_eudebug_put(d);
> >> +			d = NULL;
> >> +		}
> >> +	}
> >> +
> >> +	return d;
> >> +}
> >> +
> >>  static int xe_eudebug_queue_event(struct xe_eudebug *d,
> >>  				  struct xe_eudebug_event *event)
> >>  {
> >> @@ -810,6 +830,10 @@ static long xe_eudebug_ioctl(struct file *file,
> >>  	struct xe_eudebug * const d = file->private_data;
> >>  	long ret;
> >>  
> >> +	if (cmd != DRM_XE_EUDEBUG_IOCTL_READ_EVENT &&
> >> +	    !completion_done(&d->discovery))
> >> +		return -EBUSY;
> >> +
> >>  	switch (cmd) {
> >>  	case DRM_XE_EUDEBUG_IOCTL_READ_EVENT:
> >>  		ret = xe_eudebug_read_event(d, arg,
> >> @@ -832,6 +856,8 @@ static const struct file_operations fops = {
> >>  	.unlocked_ioctl	= xe_eudebug_ioctl,
> >>  };
> >>  
> >> +static void discovery_work_fn(struct work_struct *work);
> >> +
> >>  static int
> >>  xe_eudebug_connect(struct xe_device *xe,
> >>  		   struct drm_xe_eudebug_connect *param)
> >> @@ -866,9 +892,11 @@ xe_eudebug_connect(struct xe_device *xe,
> >>  	spin_lock_init(&d->connection.lock);
> >>  	init_waitqueue_head(&d->events.write_done);
> >>  	init_waitqueue_head(&d->events.read_done);
> >> +	init_completion(&d->discovery);
> >>  
> >>  	spin_lock_init(&d->events.lock);
> >>  	INIT_KFIFO(d->events.fifo);
> >> +	INIT_WORK(&d->discovery_work, discovery_work_fn);
> >>  
> >>  	d->res = xe_eudebug_resources_alloc();
> >>  	if (IS_ERR(d->res)) {
> >> @@ -886,6 +914,9 @@ xe_eudebug_connect(struct xe_device *xe,
> >>  		goto err_detach;
> >>  	}
> >>  
> >> +	kref_get(&d->ref);
> >> +	queue_work(xe->eudebug.ordered_wq, &d->discovery_work);
> >> +
> >>  	eu_dbg(d, "connected session %lld", d->session);
> >>  
> >>  	return fd;
> >> @@ -918,13 +949,18 @@ void xe_eudebug_init(struct xe_device *xe)
> >>  	spin_lock_init(&xe->eudebug.lock);
> >>  	INIT_LIST_HEAD(&xe->eudebug.list);
> >>  	INIT_LIST_HEAD(&xe->clients.list);
> >> +	init_rwsem(&xe->eudebug.discovery_lock);
> >>  
> >> -	xe->eudebug.available = true;
> >> +	xe->eudebug.ordered_wq = alloc_ordered_workqueue("xe-eudebug-ordered-wq", 0);
> >> +	xe->eudebug.available = !!xe->eudebug.ordered_wq;
> >>  }
> >>  
> >>  void xe_eudebug_fini(struct xe_device *xe)
> >>  {
> >>  	xe_assert(xe, list_empty_careful(&xe->eudebug.list));
> >> +
> >> +	if (xe->eudebug.ordered_wq)
> >> +		destroy_workqueue(xe->eudebug.ordered_wq);
> >>  }
> >>  
> >>  static int send_open_event(struct xe_eudebug *d, u32 flags, const u64 handle,
> >> @@ -990,21 +1026,25 @@ void xe_eudebug_file_open(struct xe_file *xef)
> >>  	struct xe_eudebug *d;
> >>  
> >>  	INIT_LIST_HEAD(&xef->eudebug.client_link);
> >> +
> >> +	down_read(&xef->xe->eudebug.discovery_lock);
> >> +
> >>  	spin_lock(&xef->xe->clients.lock);
> >>  	list_add_tail(&xef->eudebug.client_link, &xef->xe->clients.list);
> >>  	spin_unlock(&xef->xe->clients.lock);
> >>  
> >>  	d = xe_eudebug_get(xef);
> >
> > This looks like this could deadlock.
> >
> > - discovery_work_fn is queued with &d->discovery not complete
> > - This function runs and grabs &xef->xe->eudebug.discovery_lock in read
> >   mode
> > - completion_done(&d->discovery) is waited on xe_eudebug_get
> 
> If the completion is not done we just return NULL immediately.
> 

I misread the code; I was thinking xe_eudebug_get blocked on the
completion. This looks fine then - sorry for the confusion noise.

The locking here looks much better than the previous revs.

From my review PoV - locking:
Acked-by: Matthew Brost <matthew.brost at intel.com>

> Yes, previously when we actually used wait_for_completion() here
> it was a problem. But in this version completion_done() does
> not wait. Looking at completion_done(), the not done path is just
> !READ_ONCE(x->done). Did you mean the taking the spinlock to
> sync against the complete()?
> 
> -Mika
> 
> > - discovery_work_fn can't complete d->discovery) on
> >   &xef->xe->eudebug.discovery_lock in write mode
> >
> > The summary is - it not safe to wait on '&d->discovery' while holding
> > &xef->xe->eudebug.discovery_lock.
> >
> > Matt
> >
> >> -	if (!d)
> >> -		return;
> >> +	if (d)
> >> +		xe_eudebug_event_put(d, client_create_event(d, xef));
> >>  
> >> -	xe_eudebug_event_put(d, client_create_event(d, xef));
> >> +	up_read(&xef->xe->eudebug.discovery_lock);
> >>  }
> >>  
> >>  void xe_eudebug_file_close(struct xe_file *xef)
> >>  {
> >>  	struct xe_eudebug *d;
> >>  
> >> +	down_read(&xef->xe->eudebug.discovery_lock);
> >>  	d = xe_eudebug_get(xef);
> >>  	if (d)
> >>  		xe_eudebug_event_put(d, client_destroy_event(d, xef));
> >> @@ -1012,6 +1052,8 @@ void xe_eudebug_file_close(struct xe_file *xef)
> >>  	spin_lock(&xef->xe->clients.lock);
> >>  	list_del_init(&xef->eudebug.client_link);
> >>  	spin_unlock(&xef->xe->clients.lock);
> >> +
> >> +	up_read(&xef->xe->eudebug.discovery_lock);
> >>  }
> >>  
> >>  static int send_vm_event(struct xe_eudebug *d, u32 flags,
> >> @@ -1112,3 +1154,86 @@ void xe_eudebug_vm_destroy(struct xe_file *xef, struct xe_vm *vm)
> >>  
> >>  	xe_eudebug_event_put(d, vm_destroy_event(d, xef, vm));
> >>  }
> >> +
> >> +static int discover_client(struct xe_eudebug *d, struct xe_file *xef)
> >> +{
> >> +	struct xe_vm *vm;
> >> +	unsigned long i;
> >> +	int err;
> >> +
> >> +	err = client_create_event(d, xef);
> >> +	if (err)
> >> +		return err;
> >> +
> >> +	xa_for_each(&xef->vm.xa, i, vm) {
> >> +		err = vm_create_event(d, xef, vm);
> >> +		if (err)
> >> +			break;
> >> +	}
> >> +
> >> +	return err;
> >> +}
> >> +
> >> +static bool xe_eudebug_task_match(struct xe_eudebug *d, struct xe_file *xef)
> >> +{
> >> +	struct task_struct *task;
> >> +	bool match;
> >> +
> >> +	task = find_task_get(xef);
> >> +	if (!task)
> >> +		return false;
> >> +
> >> +	match = same_thread_group(d->target_task, task);
> >> +
> >> +	put_task_struct(task);
> >> +
> >> +	return match;
> >> +}
> >> +
> >> +static void discover_clients(struct xe_device *xe, struct xe_eudebug *d)
> >> +{
> >> +	struct xe_file *xef;
> >> +	int err;
> >> +
> >> +	list_for_each_entry(xef, &xe->clients.list, eudebug.client_link) {
> >> +		if (xe_eudebug_detached(d))
> >> +			break;
> >> +
> >> +		if (xe_eudebug_task_match(d, xef))
> >> +			err = discover_client(d, xef);
> >> +		else
> >> +			err = 0;
> >> +
> >> +		if (err) {
> >> +			eu_dbg(d, "discover client %p: %d\n", xef, err);
> >> +			break;
> >> +		}
> >> +	}
> >> +}
> >> +
> >> +static void discovery_work_fn(struct work_struct *work)
> >> +{
> >> +	struct xe_eudebug *d = container_of(work, typeof(*d),
> >> +					    discovery_work);
> >> +	struct xe_device *xe = d->xe;
> >> +
> >> +	if (xe_eudebug_detached(d)) {
> >> +		complete_all(&d->discovery);
> >> +		xe_eudebug_put(d);
> >> +		return;
> >> +	}
> >> +
> >> +	down_write(&xe->eudebug.discovery_lock);
> >> +
> >> +	eu_dbg(d, "Discovery start for %lld\n", d->session);
> >> +
> >> +	discover_clients(xe, d);
> >> +
> >> +	eu_dbg(d, "Discovery end for %lld\n", d->session);
> >> +
> >> +	complete_all(&d->discovery);
> >> +
> >> +	up_write(&xe->eudebug.discovery_lock);
> >> +
> >> +	xe_eudebug_put(d);
> >> +}
> >> diff --git a/drivers/gpu/drm/xe/xe_eudebug_types.h b/drivers/gpu/drm/xe/xe_eudebug_types.h
> >> index a5185f18f640..080a821db3e4 100644
> >> --- a/drivers/gpu/drm/xe/xe_eudebug_types.h
> >> +++ b/drivers/gpu/drm/xe/xe_eudebug_types.h
> >> @@ -19,6 +19,7 @@
> >>  struct xe_device;
> >>  struct task_struct;
> >>  struct xe_eudebug_event;
> >> +struct workqueue_struct;
> >>  
> >>  #define CONFIG_DRM_XE_DEBUGGER_EVENT_QUEUE_SIZE 64
> >>  
> >> @@ -96,6 +97,12 @@ struct xe_eudebug {
> >>  	/** @session: session number for this connection (for logs) */
> >>  	u64 session;
> >>  
> >> +	/** @discovery: completion to wait for discovery */
> >> +	struct completion discovery;
> >> +
> >> +	/** @discovery_work: worker to discover resources for target_task */
> >> +	struct work_struct discovery_work;
> >> +
> >>  	/** @events: kfifo queue of to-be-delivered events */
> >>  	struct {
> >>  		/** @lock: guards access to fifo */
> >> -- 
> >> 2.34.1
> >> 


More information about the Intel-xe mailing list