[PATCH 1/1] drm/xe: Store process name and pid in xe file
Matthew Auld
matthew.auld at intel.com
Tue Jul 23 10:07:04 UTC 2024
On 23/07/2024 05:24, Matthew Brost wrote:
> An xe file can outlive the associated process as the GPU cleanup is just
> triggered upon file close (process kill) and completes sometime later.
> If the file close triggers error conditions (GPU hangs) the process
> cannot be safely referenced to retrieve the name and pid for debug
> information. Store the process name and pid directly in the xe file to
> be safe.
>
> Signed-off-by: Matthew Brost <matthew.brost at intel.com>
Also if you look at drm_file_update_pid(), things look pretty scary, so
this sounds very sensible to me.
> ---
> drivers/gpu/drm/xe/xe_devcoredump.c | 10 ++--------
> drivers/gpu/drm/xe/xe_device.c | 9 +++++++++
> drivers/gpu/drm/xe/xe_device_types.h | 12 ++++++++++++
> drivers/gpu/drm/xe/xe_guc_submit.c | 10 ++--------
> 4 files changed, 25 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c
> index 62c2b10fbf1d..d8d8ca2c19d3 100644
> --- a/drivers/gpu/drm/xe/xe_devcoredump.c
> +++ b/drivers/gpu/drm/xe/xe_devcoredump.c
> @@ -171,7 +171,6 @@ static void devcoredump_snapshot(struct xe_devcoredump *coredump,
> u32 adj_logical_mask = q->logical_mask;
> u32 width_mask = (0x1 << q->width) - 1;
> const char *process_name = "no process";
> - struct task_struct *task = NULL;
>
> int i;
> bool cookie;
> @@ -179,14 +178,9 @@ static void devcoredump_snapshot(struct xe_devcoredump *coredump,
> ss->snapshot_time = ktime_get_real();
> ss->boot_time = ktime_get_boottime();
>
> - if (q->vm && q->vm->xef) {
> - task = get_pid_task(q->vm->xef->drm->pid, PIDTYPE_PID);
> - if (task)
> - process_name = task->comm;
> - }
> + if (q->vm && q->vm->xef)
> + process_name = q->vm->xef->process_name;
> strscpy(ss->process_name, process_name);
> - if (task)
> - put_task_struct(task);
>
> ss->gt = q->gt;
> INIT_WORK(&ss->work, xe_devcoredump_deferred_snap_work);
> diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
> index b677608eb592..5a7b66703aa1 100644
> --- a/drivers/gpu/drm/xe/xe_device.c
> +++ b/drivers/gpu/drm/xe/xe_device.c
> @@ -64,6 +64,7 @@ static int xe_file_open(struct drm_device *dev, struct drm_file *file)
> struct xe_drm_client *client;
> struct xe_file *xef;
> int ret = -ENOMEM;
> + struct task_struct *task = NULL;
>
> xef = kzalloc(sizeof(*xef), GFP_KERNEL);
> if (!xef)
> @@ -92,6 +93,13 @@ static int xe_file_open(struct drm_device *dev, struct drm_file *file)
> file->driver_priv = xef;
> kref_init(&xef->refcount);
>
> + task = get_pid_task(file->pid, PIDTYPE_PID);
We should probably access file->pid with rcu_access_pointer() here. In
practice it shouldn't really matter here, but the pointer is annotated
with __rcu so we should respect that.
Otherwise,
Reviewed-by: Matthew Auld <matthew.auld at intel.com>
> + if (task) {
> + xef->process_name = kstrdup(task->comm, GFP_KERNEL);
> + xef->pid = task->pid;
> + put_task_struct(task);
> + }
> +
> return 0;
> }
>
> @@ -110,6 +118,7 @@ static void xe_file_destroy(struct kref *ref)
> spin_unlock(&xe->clients.lock);
>
> xe_drm_client_put(xef->client);
> + kfree(xef->process_name);
> kfree(xef);
> }
>
> diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
> index 36252d5b1663..5b7292a9a66d 100644
> --- a/drivers/gpu/drm/xe/xe_device_types.h
> +++ b/drivers/gpu/drm/xe/xe_device_types.h
> @@ -582,6 +582,18 @@ struct xe_file {
> /** @client: drm client */
> struct xe_drm_client *client;
>
> + /**
> + * @process_name: process name for file handle, used to safely output
> + * during error situations where xe file can outlive process
> + */
> + char *process_name;
> +
> + /**
> + * @pid: pid for file handle, used to safely output during error
> + * situations where xe file can outlive process
> + */
> + pid_t pid;
> +
> /** @refcount: ref count of this xe file */
> struct kref refcount;
> };
> diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
> index da2ead86b9ae..a4570631926f 100644
> --- a/drivers/gpu/drm/xe/xe_guc_submit.c
> +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
> @@ -1072,7 +1072,6 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
> struct xe_gpu_scheduler *sched = &q->guc->sched;
> struct xe_guc *guc = exec_queue_to_guc(q);
> const char *process_name = "no process";
> - struct task_struct *task = NULL;
> int err = -ETIME;
> pid_t pid = -1;
> int i = 0;
> @@ -1172,17 +1171,12 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
> }
>
> if (q->vm && q->vm->xef) {
> - task = get_pid_task(q->vm->xef->drm->pid, PIDTYPE_PID);
> - if (task) {
> - process_name = task->comm;
> - pid = task->pid;
> - }
> + process_name = q->vm->xef->process_name;
> + pid = q->vm->xef->pid;
> }
> xe_gt_notice(guc_to_gt(guc), "Timedout job: seqno=%u, lrc_seqno=%u, guc_id=%d, flags=0x%lx in %s [%d]",
> xe_sched_job_seqno(job), xe_sched_job_lrc_seqno(job),
> q->guc->id, q->flags, process_name, pid);
> - if (task)
> - put_task_struct(task);
>
> trace_xe_sched_job_timedout(job);
>
More information about the Intel-xe
mailing list