[Intel-xe] [RFC 1/2] drm/xe/eudebug: Introduce eudebug support
Andi Shyti
andi.shyti at linux.intel.com
Thu Jun 8 14:45:15 UTC 2023
Hi Mika,
On Tue, May 02, 2023 at 01:35:49PM +0300, Mika Kuoppala wrote:
> With eudebug event interface, user space debugger process (like gdb)
> is able to keep track of resources created by another process
> (debuggee using drm/xe) and act upon these resources.
>
> For example, debugger can find a client vm which contains isa/elf
> for a particular shader/eu-kernel and then inspect and modify it
> (for example installing a breakpoint).
>
> Debugger first opens a connection to xe with a drm ioctl specifying
> target pid to connect. This returns an anon fd handle that can then be
> used to listen for events with dedicated ioctl.
>
> This patch introduces eudebug connection and event queuing, adding
> client create/destroy and vm create/destroy events as a baseline.
> More events for full debugger operation are needed and
> those will be introduced in follow up patches.
>
> The resource tracking parts are inspired by the work of
> Maciej Patelczyk on resource handling for i915. Chris Wilson
> suggested improvement of two ways mapping which makes it easy to
> use resource map as a definitive bookkeep of what resources
> are played to debugger in the discovery phase (on follow up patch).
>
> v2: - event printer removed (Maarten)
> - trim down kfifo accessors (Maarten)
> - xa_alloc spurious locking removed (Maarten)
>
> Cc: Maarten Lankhorst <maarten.lankhorst at linux.intel.com>
> Cc: Lucas De Marchi <lucas.demarchi at intel.com>
> Cc: Maciej Patelczyk <maciej.patelczyk at intel.com>
> Cc: Dominik Grzegorzek <dominik.grzegorzek at intel.com>
> Signed-off-by: Mika Kuoppala <mika.kuoppala at linux.intel.com>
please keep me in the loop.
[...]
> static int xe_file_open(struct drm_device *dev, struct drm_file *file)
> {
> + struct xe_device *xe = to_xe_device(dev);
> struct xe_file *xef;
> + int err;
>
> xef = kzalloc(sizeof(*xef), GFP_KERNEL);
> if (!xef)
> @@ -50,7 +54,15 @@ static int xe_file_open(struct drm_device *dev, struct drm_file *file)
> xa_init_flags(&xef->engine.xa, XA_FLAGS_ALLOC1);
>
> file->driver_priv = xef;
> - return 0;
> +
> + err = xa_alloc(&xe->clients.xa, &xef->client_id, xef, xa_limit_32b, GFP_KERNEL);
> +
> + if (!err)
> + xe_eudebug_file_open(xef);
> + else
> + kfree(xef);
> +
> + return err;
this is an unusual error route. Normally it would be:
err = xa_alloc(&xe->clients.xa, &xef->client_id, xef, xa_limit_32b, GFP_KERNEL);
if (err) {
kfree(xef);
return err;
}
xe_eudebug_file_open(xef);
return err;
Your choice.
> }
>
> static void device_kill_persistent_engines(struct xe_device *xe,
> @@ -79,6 +91,12 @@ static void xe_file_close(struct drm_device *dev, struct drm_file *file)
> mutex_unlock(&xef->vm.lock);
> mutex_destroy(&xef->vm.lock);
>
> + xe_eudebug_file_close(xef);
> +
> + mutex_lock(&xe->clients.lock);
> + xa_erase(&xe->clients.xa, xef->client_id);
> + mutex_unlock(&xe->clients.lock);
do we need to lock here? xa_erase() is protected by spinlocks.
> +
> kfree(xef);
> }
[...]
> /** @d3cold_allowed: Indicates if d3cold is a valid device state */
> bool d3cold_allowed;
>
> + /** @debugger connection list and globals for device */
> + struct {
> + /** @lock: protects the list of connections */
> + spinlock_t lock;
> + /** @list: list of connections, aka debuggers */
> + struct list_head list;
> +
> + /** @session_count: session counter to track connections */
> + u64 session_count;
> +
> + /** @available: is the debugging functionality available */
> + bool available;
> + } eudebug;
> +
> + /** @clients xe_file tracking for eudebug discovery */
> + struct {
> + /** @lock: protects the xa */
> + struct mutex lock;
xa should already have its own spinlock. Do we really need a
mutex here?
> + /** @xa: xarray of xe_files currently open */
> + struct xarray xa;
> + } clients;
> +
> /* private: */
>
[...]
> +/*
> + * This is racy as we dont take the lock for read but all the
> + * callsites can handle the race so we can live without lock.
> + */
> +__no_kcsan
> +static unsigned int
> +event_fifo_num_events_peek(const struct xe_eudebug * const d)
can you please use "locked" suffix to make it clear?
> +{
> + return kfifo_len(&d->events.fifo);
> +}
[...]
> +static struct xe_eudebug_resources *
> +xe_eudebug_resources_alloc(void)
> +{
> + struct xe_eudebug_resources *res;
> + int err;
> + int i;
> +
> + res = kzalloc(sizeof(*res), GFP_KERNEL);
> + if (!res)
> + return NULL;
> +
> + mutex_init(&res->lock);
> +
> + for (i = 0; i < XE_EUDEBUG_RES_TYPE_COUNT; i++) {
> + xa_init_flags(&res->rt[i].xa, XA_FLAGS_ALLOC1);
> + err = rhashtable_init(&res->rt[i].rh, &rhash_res);
> +
> + if (err) {
> + while (i--) {
> + xa_destroy(&res->rt[i].xa);
> + rhashtable_destroy(&res->rt[i].rh);
> + }
> +
> + goto out;
> + }
> + }
> +
> +out:
> + if (err) {
> + kfree(res);
> + res = NULL;
> + }
you could put this inside the "if (err)" above and return NULL
directly there without needing any extra "out:" label.
Or you could put everything outside here.
> +
> + return res;
> +}
> +
> +static void res_free_fn(void *ptr, void *arg)
> +{
> + XE_WARN_ON(ptr);
> + kfree(ptr);
> +}
> +
> +static void
> +xe_eudebug_resources_free(struct xe_eudebug *d)
> +{
> + struct xe_eudebug_resources *res = d->res;
> + struct xe_eudebug_handle *h;
> + unsigned long i, j;
> + int err;
> +
> + mutex_lock(&res->lock);
> + for (i = 0; i < XE_EUDEBUG_RES_TYPE_COUNT; i++) {
> + struct xe_eudebug_resource *r = &res->rt[i];
> +
> + xa_for_each(&r->xa, j, h) {
> + struct xe_eudebug_handle *t;
> +
> + err = rhashtable_remove_fast(&r->rh,
> + &h->rh_head,
> + rhash_res);
> + XE_WARN_ON(err);
> + t = xa_erase(&r->xa, h->id);
> + XE_WARN_ON(t != h);
> + kfree(t);
a bit difficult sometimes to follow with variables called t, h,
d, r... maybe a more meaningful name sould help.
> + }
> + }
> + mutex_unlock(&res->lock);
> +
> + for (i = 0; i < XE_EUDEBUG_RES_TYPE_COUNT; i++) {
> + struct xe_eudebug_resource *r = &res->rt[i];
> +
> + rhashtable_free_and_destroy(&r->rh, res_free_fn, NULL);
> + XE_WARN_ON(!xa_empty(&r->xa));
> + xa_destroy(&r->xa);
> + }
> +
> + mutex_destroy(&res->lock);
> +
> + kfree(res);
> +}
[...]
> +static struct xe_eudebug *
> +xe_eudebug_get(struct xe_file *xef)
> +{
> + struct xe_device *xe = xef_to_xe(xef);
> + struct task_struct *task;
> + struct xe_eudebug *d = NULL;
> +
> + task = find_task_get(xef->drm->pid);
if (!task)
return NULL;
and you don't need to initialize and the rest gets also easier.
> + if (task) {
> + d = xe_eudebug_for_task_get(xe, task);
> + put_task_struct(task);
> + }
> +
> + if (d && xe_eudebug_detached(d)) {
> + xe_eudebug_put(d);
> + d = NULL;
> + }
> +
> + return d;
> +}
[...]
> +static int _xe_eudebug_queue_event(struct xe_eudebug *d,
> + struct xe_eudebug_event *event,
> + gfp_t gfp)
> +{
> + u64 start_t;
> + int ret;
> +
> + XE_BUG_ON(event->size <= sizeof(struct xe_eudebug_event));
> + XE_BUG_ON(!event->type);
> + XE_BUG_ON(event->type == DRM_XE_EUDEBUG_EVENT_READ);
> +
> + ret = queue_event(d, &event);
> + if (!ret)
> + return 0;
> +
> + start_t = ktime_get();
> +
> + while (ret == -ENOSPC) {
> + struct xe_eudebug_event *blocking;
> +
> + ret = queue_event(d, &event);
> + if (ret != -ENOSPC)
> + break;
> +
> + blocking = event_fifo_pending(d);
> +
> + msleep(1 + 1 * event_fifo_num_events_peek(d));
as this parametric, you could use fsleep().
> + /* restart timeout if we see progress on fifo */
> + if (blocking && blocking != event_fifo_pending(d))
> + start_t = ktime_get();
> +
> + if (ktime_ms_delta(ktime_get(), start_t) >=
> + XE_EUDEBUG_NO_READ_DETECTED_TIMEOUT_MS)
> + ret = -ETIMEDOUT;
> + }
> +
> + if (ret) {
> + eu_warn(d, "event %llu queue failed (blocked %lld ms), disconnecting with %d",
> + event ? event->seqno : 0,
> + ktime_ms_delta(ktime_get(), start_t),
> + ret);
> + xe_eudebug_disconnect(d, ret);
> + }
> +
> + kfree(event);
> +
> + return ret;
> +}
> +
> +static int xe_eudebug_queue_event(struct xe_eudebug *d,
> + struct xe_eudebug_event *event)
> +{
> + return _xe_eudebug_queue_event(d, event, GFP_KERNEL);
> +}
> +
> +static struct xe_eudebug_handle *
> +alloc_handle(const int type, const void * const key)
please, drop the const, it's not const at the caller and you
"lose" it at the cast.
> +{
> + struct xe_eudebug_handle *h;
> +
> + h = kzalloc(sizeof(*h), GFP_KERNEL);
> + if (!h)
> + return NULL;
> +
> + h->key = (u64)key;
> +
> + return h;
> +}
[...]
> +static int xe_eudebug_add_handle(struct xe_eudebug *d,
> + int type,
> + void *p)
> +{
> + struct xe_eudebug_resource *r;
> + struct xe_eudebug_handle *h;
> + int err;
> +
> + if (xe_eudebug_detached(d))
> + return -ENOTCONN;
where did you fish enotconn? Isn't this for networks? Maybe
einval would be more appropriate?
> +
> + h = alloc_handle(type, p);
> + if (!h)
> + return -ENOMEM;
[...]
> + spin_lock(&d->events.lock);
> + event = event_fifo_pending(d);
> + if (event) {
> + if (user_event.size < event->size) {
> + ret = -EMSGSIZE;
-EINVAL?
> + } else if (!access_ok(user_orig, event->size)) {
> + ret = -EFAULT;
> + } else if (!kfifo_get(&d->events.fifo, &event)) {
> + eu_warn(d, "internal fifo corruption");
> + ret = -ENOTCONN;
> + } else {
> + ret = 0;
> + }
[...]
> +int xe_eudebug_connect_ioctl(struct drm_device *dev,
> + void *data,
> + struct drm_file *file)
> +{
> + struct xe_device *xe = to_xe_device(dev);
> + struct drm_xe_eudebug_connect_param * const param = data;
> + int ret = 0;
> +
> + ret = xe_eudebug_connect(xe, param);
> +
> + return ret;
just return xe_eudebug....()
Quite a bit to digest... it was just a quick look.
Andi
More information about the Intel-xe
mailing list