[PATCH 1/1] drm/xe: Store process name and pid in xe file

Matthew Auld matthew.auld at intel.com
Tue Jul 23 10:07:04 UTC 2024


On 23/07/2024 05:24, Matthew Brost wrote:
> An xe file can outlive the associated process as the GPU cleanup is just
> triggered upon file close (process kill) and completes sometime later.
> If the file close triggers error conditions (GPU hangs) the process
> cannot be safely referenced to retrieve the name and pid for debug
> information. Store the process name and pid directly in the xe file to
> be safe.
> 
> Signed-off-by: Matthew Brost <matthew.brost at intel.com>

Also if you look at drm_file_update_pid(), things look pretty scary, so 
this sounds very sensible to me.

> ---
>   drivers/gpu/drm/xe/xe_devcoredump.c  | 10 ++--------
>   drivers/gpu/drm/xe/xe_device.c       |  9 +++++++++
>   drivers/gpu/drm/xe/xe_device_types.h | 12 ++++++++++++
>   drivers/gpu/drm/xe/xe_guc_submit.c   | 10 ++--------
>   4 files changed, 25 insertions(+), 16 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c
> index 62c2b10fbf1d..d8d8ca2c19d3 100644
> --- a/drivers/gpu/drm/xe/xe_devcoredump.c
> +++ b/drivers/gpu/drm/xe/xe_devcoredump.c
> @@ -171,7 +171,6 @@ static void devcoredump_snapshot(struct xe_devcoredump *coredump,
>   	u32 adj_logical_mask = q->logical_mask;
>   	u32 width_mask = (0x1 << q->width) - 1;
>   	const char *process_name = "no process";
> -	struct task_struct *task = NULL;
>   
>   	int i;
>   	bool cookie;
> @@ -179,14 +178,9 @@ static void devcoredump_snapshot(struct xe_devcoredump *coredump,
>   	ss->snapshot_time = ktime_get_real();
>   	ss->boot_time = ktime_get_boottime();
>   
> -	if (q->vm && q->vm->xef) {
> -		task = get_pid_task(q->vm->xef->drm->pid, PIDTYPE_PID);
> -		if (task)
> -			process_name = task->comm;
> -	}
> +	if (q->vm && q->vm->xef)
> +		process_name = q->vm->xef->process_name;
>   	strscpy(ss->process_name, process_name);
> -	if (task)
> -		put_task_struct(task);
>   
>   	ss->gt = q->gt;
>   	INIT_WORK(&ss->work, xe_devcoredump_deferred_snap_work);
> diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
> index b677608eb592..5a7b66703aa1 100644
> --- a/drivers/gpu/drm/xe/xe_device.c
> +++ b/drivers/gpu/drm/xe/xe_device.c
> @@ -64,6 +64,7 @@ static int xe_file_open(struct drm_device *dev, struct drm_file *file)
>   	struct xe_drm_client *client;
>   	struct xe_file *xef;
>   	int ret = -ENOMEM;
> +	struct task_struct *task = NULL;
>   
>   	xef = kzalloc(sizeof(*xef), GFP_KERNEL);
>   	if (!xef)
> @@ -92,6 +93,13 @@ static int xe_file_open(struct drm_device *dev, struct drm_file *file)
>   	file->driver_priv = xef;
>   	kref_init(&xef->refcount);
>   
> +	task = get_pid_task(file->pid, PIDTYPE_PID);

We should probably access file->pid with rcu_access_pointer() here. In 
practice it shouldn't really matter here, but the pointer is annotated 
with __rcu so we should respect that.

Otherwise,
Reviewed-by: Matthew Auld <matthew.auld at intel.com>

> +	if (task) {
> +		xef->process_name = kstrdup(task->comm, GFP_KERNEL);
> +		xef->pid = task->pid;
> +		put_task_struct(task);
> +	}
> +
>   	return 0;
>   }
>   
> @@ -110,6 +118,7 @@ static void xe_file_destroy(struct kref *ref)
>   	spin_unlock(&xe->clients.lock);
>   
>   	xe_drm_client_put(xef->client);
> +	kfree(xef->process_name);
>   	kfree(xef);
>   }
>   
> diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
> index 36252d5b1663..5b7292a9a66d 100644
> --- a/drivers/gpu/drm/xe/xe_device_types.h
> +++ b/drivers/gpu/drm/xe/xe_device_types.h
> @@ -582,6 +582,18 @@ struct xe_file {
>   	/** @client: drm client */
>   	struct xe_drm_client *client;
>   
> +	/**
> +	 * @process_name: process name for file handle, used to safely output
> +	 * during error situations where xe file can outlive process
> +	 */
> +	char *process_name;
> +
> +	/**
> +	 * @pid: pid for file handle, used to safely output during error
> +	 * situations where xe file can outlive process
> +	 */
> +	pid_t pid;
> +
>   	/** @refcount: ref count of this xe file */
>   	struct kref refcount;
>   };
> diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
> index da2ead86b9ae..a4570631926f 100644
> --- a/drivers/gpu/drm/xe/xe_guc_submit.c
> +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
> @@ -1072,7 +1072,6 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
>   	struct xe_gpu_scheduler *sched = &q->guc->sched;
>   	struct xe_guc *guc = exec_queue_to_guc(q);
>   	const char *process_name = "no process";
> -	struct task_struct *task = NULL;
>   	int err = -ETIME;
>   	pid_t pid = -1;
>   	int i = 0;
> @@ -1172,17 +1171,12 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
>   	}
>   
>   	if (q->vm && q->vm->xef) {
> -		task = get_pid_task(q->vm->xef->drm->pid, PIDTYPE_PID);
> -		if (task) {
> -			process_name = task->comm;
> -			pid = task->pid;
> -		}
> +		process_name = q->vm->xef->process_name;
> +		pid = q->vm->xef->pid;
>   	}
>   	xe_gt_notice(guc_to_gt(guc), "Timedout job: seqno=%u, lrc_seqno=%u, guc_id=%d, flags=0x%lx in %s [%d]",
>   		     xe_sched_job_seqno(job), xe_sched_job_lrc_seqno(job),
>   		     q->guc->id, q->flags, process_name, pid);
> -	if (task)
> -		put_task_struct(task);
>   
>   	trace_xe_sched_job_timedout(job);
>   


More information about the Intel-xe mailing list