[Intel-xe] [RFC 1/2] drm/xe/eudebug: Introduce eudebug support

Thu Jun 8 14:45:15 UTC 2023

Hi Mika,

On Tue, May 02, 2023 at 01:35:49PM +0300, Mika Kuoppala wrote:
> With eudebug event interface, user space debugger process (like gdb)
> is able to keep track of resources created by another process
> (debuggee using drm/xe) and act upon these resources.
> 
> For example, debugger can find a client vm which contains isa/elf
> for a particular shader/eu-kernel and then inspect and modify it
> (for example installing a breakpoint).
> 
> Debugger first opens a connection to xe with a drm ioctl specifying
> target pid to connect. This returns an anon fd handle that can then be
> used to listen for events with dedicated ioctl.
> 
> This patch introduces eudebug connection and event queuing, adding
> client create/destroy and vm create/destroy events as a baseline.
> More events for full debugger operation are needed and
> those will be introduced in follow up patches.
> 
> The resource tracking parts are inspired by the work of
> Maciej Patelczyk on resource handling for i915. Chris Wilson
> suggested improvement of two ways mapping which makes it easy to
> use resource map as a definitive bookkeep of what resources
> are played to debugger in the discovery phase (on follow up patch).
> 
> v2: - event printer removed (Maarten)
>     - trim down kfifo accessors (Maarten)
>     - xa_alloc spurious locking removed (Maarten)
> 
> Cc: Maarten Lankhorst <maarten.lankhorst at linux.intel.com>
> Cc: Lucas De Marchi <lucas.demarchi at intel.com>
> Cc: Maciej Patelczyk <maciej.patelczyk at intel.com>
> Cc: Dominik Grzegorzek <dominik.grzegorzek at intel.com>
> Signed-off-by: Mika Kuoppala <mika.kuoppala at linux.intel.com>

please keep me in the loop.

[...]

>  static int xe_file_open(struct drm_device *dev, struct drm_file *file)
>  {
> +	struct xe_device *xe = to_xe_device(dev);
>  	struct xe_file *xef;
> +	int err;
>  
>  	xef = kzalloc(sizeof(*xef), GFP_KERNEL);
>  	if (!xef)
> @@ -50,7 +54,15 @@ static int xe_file_open(struct drm_device *dev, struct drm_file *file)
>  	xa_init_flags(&xef->engine.xa, XA_FLAGS_ALLOC1);
>  
>  	file->driver_priv = xef;
> -	return 0;
> +
> +	err = xa_alloc(&xe->clients.xa, &xef->client_id, xef, xa_limit_32b, GFP_KERNEL);
> +
> +	if (!err)
> +		xe_eudebug_file_open(xef);
> +	else
> +		kfree(xef);
> +
> +	return err;

this is an unusual error route. Normally it would be:

	err = xa_alloc(&xe->clients.xa, &xef->client_id, xef, xa_limit_32b, GFP_KERNEL);
	if (err) {
		kfree(xef);
		return err;
	}

	xe_eudebug_file_open(xef);

	return err;

Your choice.

>  }
>  
>  static void device_kill_persistent_engines(struct xe_device *xe,
> @@ -79,6 +91,12 @@ static void xe_file_close(struct drm_device *dev, struct drm_file *file)
>  	mutex_unlock(&xef->vm.lock);
>  	mutex_destroy(&xef->vm.lock);
>  
> +	xe_eudebug_file_close(xef);
> +
> +	mutex_lock(&xe->clients.lock);
> +	xa_erase(&xe->clients.xa, xef->client_id);
> +	mutex_unlock(&xe->clients.lock);

do we need to lock here? xa_erase() is protected by spinlocks.

> +
>  	kfree(xef);
>  }

[...]

>  	/** @d3cold_allowed: Indicates if d3cold is a valid device state */
>  	bool d3cold_allowed;
>  
> +	/** @debugger connection list and globals for device */
> +	struct {
> +		/** @lock: protects the list of connections */
> +		spinlock_t lock;
> +		/** @list: list of connections, aka debuggers */
> +		struct list_head list;
> +
> +		/** @session_count: session counter to track connections */
> +		u64 session_count;
> +
> +		/** @available: is the debugging functionality available */
> +		bool available;
> +	} eudebug;
> +
> +	/** @clients xe_file tracking for eudebug discovery */
> +	struct {
> +		/** @lock: protects the xa */
> +		struct mutex lock;

xa should already have its own spinlock. Do we really need a
mutex here?

> +		/** @xa: xarray of xe_files currently open */
> +		struct xarray xa;
> +	} clients;
> +
>  	/* private: */
>  

[...]

> +/*
> + * This is racy as we dont take the lock for read but all the
> + * callsites can handle the race so we can live without lock.
> + */
> +__no_kcsan
> +static unsigned int
> +event_fifo_num_events_peek(const struct xe_eudebug * const d)

can you please use "locked" suffix to make it clear?

> +{
> +	return kfifo_len(&d->events.fifo);
> +}

[...]

> +static struct xe_eudebug_resources *
> +xe_eudebug_resources_alloc(void)
> +{
> +	struct xe_eudebug_resources *res;
> +	int err;
> +	int i;
> +
> +	res = kzalloc(sizeof(*res), GFP_KERNEL);
> +	if (!res)
> +		return NULL;
> +
> +	mutex_init(&res->lock);
> +
> +	for (i = 0; i < XE_EUDEBUG_RES_TYPE_COUNT; i++) {
> +		xa_init_flags(&res->rt[i].xa, XA_FLAGS_ALLOC1);
> +		err = rhashtable_init(&res->rt[i].rh, &rhash_res);
> +
> +		if (err) {
> +			while (i--) {
> +				xa_destroy(&res->rt[i].xa);
> +				rhashtable_destroy(&res->rt[i].rh);
> +			}
> +
> +			goto out;
> +		}
> +	}
> +
> +out:
> +	if (err) {
> +		kfree(res);
> +		res = NULL;
> +	}

you could put this inside the "if (err)" above and return NULL
directly there without needing any extra "out:" label.

Or you could put everything outside here.

> +
> +	return res;
> +}
> +
> +static void res_free_fn(void *ptr, void *arg)
> +{
> +	XE_WARN_ON(ptr);
> +	kfree(ptr);
> +}
> +
> +static void
> +xe_eudebug_resources_free(struct xe_eudebug *d)
> +{
> +	struct xe_eudebug_resources *res = d->res;
> +	struct xe_eudebug_handle *h;
> +	unsigned long i, j;
> +	int err;
> +
> +	mutex_lock(&res->lock);
> +	for (i = 0; i < XE_EUDEBUG_RES_TYPE_COUNT; i++) {
> +		struct xe_eudebug_resource *r = &res->rt[i];
> +
> +		xa_for_each(&r->xa, j, h) {
> +			struct xe_eudebug_handle *t;
> +
> +			err = rhashtable_remove_fast(&r->rh,
> +						     &h->rh_head,
> +						     rhash_res);
> +			XE_WARN_ON(err);
> +			t = xa_erase(&r->xa, h->id);
> +			XE_WARN_ON(t != h);
> +			kfree(t);

a bit difficult sometimes to follow with variables called t, h,
d, r... maybe a more meaningful name sould help.

> +		}
> +	}
> +	mutex_unlock(&res->lock);
> +
> +	for (i = 0; i < XE_EUDEBUG_RES_TYPE_COUNT; i++) {
> +		struct xe_eudebug_resource *r = &res->rt[i];
> +
> +		rhashtable_free_and_destroy(&r->rh, res_free_fn, NULL);
> +		XE_WARN_ON(!xa_empty(&r->xa));
> +		xa_destroy(&r->xa);
> +	}
> +
> +	mutex_destroy(&res->lock);
> +
> +	kfree(res);
> +}

[...]

> +static struct xe_eudebug *
> +xe_eudebug_get(struct xe_file *xef)
> +{
> +	struct xe_device *xe = xef_to_xe(xef);
> +	struct task_struct *task;
> +	struct xe_eudebug *d = NULL;
> +
> +	task = find_task_get(xef->drm->pid);

   if (!task)
   	return NULL;

and you don't need to initialize and the rest gets also easier.

> +	if (task) {
> +		d = xe_eudebug_for_task_get(xe, task);
> +		put_task_struct(task);
> +	}
> +
> +	if (d && xe_eudebug_detached(d)) {
> +		xe_eudebug_put(d);
> +		d = NULL;
> +	}
> +
> +	return d;
> +}

[...]

> +static int _xe_eudebug_queue_event(struct xe_eudebug *d,
> +				   struct xe_eudebug_event *event,
> +				   gfp_t gfp)
> +{
> +	u64 start_t;
> +	int ret;
> +
> +	XE_BUG_ON(event->size <= sizeof(struct xe_eudebug_event));
> +	XE_BUG_ON(!event->type);
> +	XE_BUG_ON(event->type == DRM_XE_EUDEBUG_EVENT_READ);
> +
> +	ret = queue_event(d, &event);
> +	if (!ret)
> +		return 0;
> +
> +	start_t = ktime_get();
> +
> +	while (ret == -ENOSPC) {
> +		struct xe_eudebug_event *blocking;
> +
> +		ret = queue_event(d, &event);
> +		if (ret != -ENOSPC)
> +			break;
> +
> +		blocking = event_fifo_pending(d);
> +
> +		msleep(1 + 1 * event_fifo_num_events_peek(d));

as this parametric, you could use fsleep().

> +		/* restart timeout if we see progress on fifo */
> +		if (blocking && blocking != event_fifo_pending(d))
> +			start_t = ktime_get();
> +
> +		if (ktime_ms_delta(ktime_get(), start_t) >=
> +		    XE_EUDEBUG_NO_READ_DETECTED_TIMEOUT_MS)
> +			ret = -ETIMEDOUT;
> +	}
> +
> +	if (ret) {
> +		eu_warn(d, "event %llu queue failed (blocked %lld ms), disconnecting with %d",
> +			event ? event->seqno : 0,
> +			ktime_ms_delta(ktime_get(), start_t),
> +			ret);
> +		xe_eudebug_disconnect(d, ret);
> +	}
> +
> +	kfree(event);
> +
> +	return ret;
> +}
> +
> +static int xe_eudebug_queue_event(struct xe_eudebug *d,
> +				  struct xe_eudebug_event *event)
> +{
> +	return _xe_eudebug_queue_event(d, event, GFP_KERNEL);
> +}
> +
> +static struct xe_eudebug_handle *
> +alloc_handle(const int type, const void * const key)

please, drop the const, it's not const at the caller and you
"lose" it at the cast.

> +{
> +	struct xe_eudebug_handle *h;
> +
> +	h = kzalloc(sizeof(*h), GFP_KERNEL);
> +	if (!h)
> +		return NULL;
> +
> +	h->key = (u64)key;
> +
> +	return h;
> +}

[...]

> +static int xe_eudebug_add_handle(struct xe_eudebug *d,
> +				 int type,
> +				 void *p)
> +{
> +	struct xe_eudebug_resource *r;
> +	struct xe_eudebug_handle *h;
> +	int err;
> +
> +	if (xe_eudebug_detached(d))
> +		return -ENOTCONN;

where did you fish enotconn? Isn't this for networks? Maybe
einval would be more appropriate?

> +
> +	h = alloc_handle(type, p);
> +	if (!h)
> +		return -ENOMEM;

[...]

> +	spin_lock(&d->events.lock);
> +	event = event_fifo_pending(d);
> +	if (event) {
> +		if (user_event.size < event->size) {
> +			ret = -EMSGSIZE;

-EINVAL?

> +		} else if (!access_ok(user_orig, event->size)) {
> +			ret = -EFAULT;
> +		} else if (!kfifo_get(&d->events.fifo, &event)) {
> +			eu_warn(d, "internal fifo corruption");
> +			ret = -ENOTCONN;
> +		} else {
> +			ret = 0;
> +		}

[...]

> +int xe_eudebug_connect_ioctl(struct drm_device *dev,
> +			     void *data,
> +			     struct drm_file *file)
> +{
> +	struct xe_device *xe = to_xe_device(dev);
> +	struct drm_xe_eudebug_connect_param * const param = data;
> +	int ret = 0;
> +
> +	ret = xe_eudebug_connect(xe, param);
> +
> +	return ret;

just return xe_eudebug....()

Quite a bit to digest... it was just a quick look.

Andi