[PATCH 2/2] drm/panthor: Expose scheduler groups info in debugfs

Mon Sep 23 09:51:47 UTC 2024

On 18/09/2024 09:50, Mary Guillemard wrote:
> This adds a new debugfs file named "sched_groups" allowing a user to
> query information about all userspace clients scheduler groups.
> 
> As we uses drm_device.filelist to achieve it, we also expose the
> client_id with the task information to differentiate one client
> from another inside the same task.
> 
> Signed-off-by: Mary Guillemard <mary.guillemard at collabora.com>

Can you explain a bit more why you want this? I'm not against the idea
but I'm wary of adding 'debugging features' because often they fall into
one of two camps:

 * Genuinely useful features that shouldn't be in debugfs because people
want a stable API.

 * Niche features that take maintainance effort despite not being
particularly useful.

> ---
>  drivers/gpu/drm/panthor/panthor_drv.c   |   1 +
>  drivers/gpu/drm/panthor/panthor_sched.c | 127 ++++++++++++++++++++++++
>  drivers/gpu/drm/panthor/panthor_sched.h |   4 +
>  3 files changed, 132 insertions(+)
> 
> diff --git a/drivers/gpu/drm/panthor/panthor_drv.c b/drivers/gpu/drm/panthor/panthor_drv.c
> index 0d825d63d712..aa390597d355 100644
> --- a/drivers/gpu/drm/panthor/panthor_drv.c
> +++ b/drivers/gpu/drm/panthor/panthor_drv.c
> @@ -1450,6 +1450,7 @@ static const struct file_operations panthor_drm_driver_fops = {
>  static void panthor_debugfs_init(struct drm_minor *minor)
>  {
>  	panthor_mmu_debugfs_init(minor);
> +	panthor_sched_debugfs_init(minor);
>  }
>  #endif
>  
> diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
> index f15abeef4ece..0c1ddbfbe164 100644
> --- a/drivers/gpu/drm/panthor/panthor_sched.c
> +++ b/drivers/gpu/drm/panthor/panthor_sched.c
> @@ -1,6 +1,7 @@
>  // SPDX-License-Identifier: GPL-2.0 or MIT
>  /* Copyright 2023 Collabora ltd. */
>  
> +#include <drm/drm_debugfs.h>
>  #include <drm/drm_drv.h>
>  #include <drm/drm_exec.h>
>  #include <drm/drm_gem_shmem_helper.h>
> @@ -3581,3 +3582,129 @@ int panthor_sched_init(struct panthor_device *ptdev)
>  	ptdev->scheduler = sched;
>  	return 0;
>  }
> +
> +#ifdef CONFIG_DEBUG_FS
> +static const char *panthor_csg_priority_names[PANTHOR_CSG_PRIORITY_COUNT] = {
> +	"LOW",
> +	"MEDIUM",
> +	"HIGH",
> +	"REALTIME"
> +};
> +
> +static const char *panthor_group_state_names[PANTHOR_CS_GROUP_UNKNOWN_STATE] = {
> +	"CREATED",
> +	"ACTIVE",
> +	"SUSPENDED",
> +	"TERMINATED"
> +};
> +
> +static void show_panthor_queue(const struct panthor_queue *queue,
> +			       u32 queue_index,
> +			       struct seq_file *m)
> +{
> +	seq_printf(m, "queue %u:", queue_index);
> +	seq_printf(m, " priority %u", queue->priority);
> +	seq_printf(m, " doorbell_id %d", queue->doorbell_id);
> +	seq_puts(m, "\n");
> +}
> +
> +static void show_panthor_group(struct panthor_group *group,
> +			       u32 group_handle,
> +			       struct seq_file *m)
> +{
> +	u32 i;
> +
> +	group_get(group);

This looks wrong - this function relies on group not becoming invalid
before the call, so we'd better be holding a sufficient lock/reference
on the group before show_panthor_group() is called otherwise we've got a
race before the group_get() call.

Although I can see this bug is present elsewhere in panthor_sched.c now
I look...

> +
> +	seq_printf(m, "group %u:", group_handle);
> +	seq_printf(m, " priority %s", group->priority < PANTHOR_CSG_PRIORITY_COUNT ?
> +		   panthor_csg_priority_names[group->priority] : "UNKNOWN");
> +	seq_printf(m, " state %s", group->state < PANTHOR_CS_GROUP_UNKNOWN_STATE ?
> +		   panthor_group_state_names[group->state] : "UNKNOWN");
> +	seq_printf(m, " csg_id %d", group->csg_id);
> +	seq_printf(m, " csg_priority %d", group->csg_priority);
> +	seq_printf(m, " compute_core_mask 0x%016llx", group->compute_core_mask);
> +	seq_printf(m, " fragment_core_mask 0x%016llx", group->fragment_core_mask);
> +	seq_printf(m, " tiler_core_mask 0x%016llx", group->tiler_core_mask);
> +	seq_printf(m, " max_compute_cores %d", group->max_compute_cores);
> +	seq_printf(m, " max_fragment_cores %d", group->max_fragment_cores);
> +	seq_printf(m, " max_tiler_cores %d", group->max_tiler_cores);
> +	seq_puts(m, "\n");
> +
> +	for (i = 0; i < group->queue_count; i++)
> +		show_panthor_queue(group->queues[i], i, m);
> +
> +	group_put(group);
> +}
> +
> +static int show_file_group_pool(const struct panthor_file *pfile, struct seq_file *m)
> +{
> +	struct panthor_group_pool *gpool = pfile->groups;
> +	struct panthor_group *group;
> +	unsigned long i;
> +
> +	if (IS_ERR_OR_NULL(gpool))
> +		return 0;
> +
> +	xa_for_each(&gpool->xa, i, group)
> +		show_panthor_group(group, i, m);

Here I see no protection against a racing call to PANTHOR_GROUP_DESTROY,
so group could become invalid between xa_for_each returning it and the
call to show_panthor_group().

Steve

> +
> +	return 0;
> +}
> +
> +static int show_each_file(struct seq_file *m, void *arg)
> +{
> +	struct drm_info_node *node = (struct drm_info_node *)m->private;
> +	struct drm_device *ddev = node->minor->dev;
> +	int (*show)(const struct panthor_file *, struct seq_file *) = node->info_ent->data;
> +	struct drm_file *file;
> +	int ret;
> +
> +	ret = mutex_lock_interruptible(&ddev->filelist_mutex);
> +	if (ret)
> +		return ret;
> +
> +	list_for_each_entry(file, &ddev->filelist, lhead) {
> +		struct task_struct *task;
> +		struct panthor_file *pfile = file->driver_priv;
> +		struct pid *pid;
> +
> +		/*
> +		 * Although we have a valid reference on file->pid, that does
> +		 * not guarantee that the task_struct who called get_pid() is
> +		 * still alive (e.g. get_pid(current) => fork() => exit()).
> +		 * Therefore, we need to protect this ->comm access using RCU.
> +		 */
> +		rcu_read_lock();
> +		pid = rcu_dereference(file->pid);
> +		task = pid_task(pid, PIDTYPE_TGID);
> +		seq_printf(m, "client_id %8llu pid %8d command %s:\n", file->client_id,
> +			   pid_nr(pid), task ? task->comm : "<unknown>");
> +		rcu_read_unlock();
> +
> +		ret = show(pfile, m);
> +		if (ret < 0)
> +			break;
> +
> +		seq_puts(m, "\n");
> +	}
> +
> +	mutex_unlock(&ddev->filelist_mutex);
> +	return ret;
> +}
> +
> +static struct drm_info_list panthor_sched_debugfs_list[] = {
> +	{ "sched_groups", show_each_file, 0, show_file_group_pool },
> +};
> +
> +/**
> + * panthor_sched_debugfs_init() - Initialize scheduler debugfs entries
> + * @minor: Minor.
> + */
> +void panthor_sched_debugfs_init(struct drm_minor *minor)
> +{
> +	drm_debugfs_create_files(panthor_sched_debugfs_list,
> +				 ARRAY_SIZE(panthor_sched_debugfs_list),
> +				 minor->debugfs_root, minor);
> +}
> +#endif /* CONFIG_DEBUG_FS */
> diff --git a/drivers/gpu/drm/panthor/panthor_sched.h b/drivers/gpu/drm/panthor/panthor_sched.h
> index 3a30d2328b30..577239aa66bf 100644
> --- a/drivers/gpu/drm/panthor/panthor_sched.h
> +++ b/drivers/gpu/drm/panthor/panthor_sched.h
> @@ -47,4 +47,8 @@ void panthor_sched_resume(struct panthor_device *ptdev);
>  void panthor_sched_report_mmu_fault(struct panthor_device *ptdev);
>  void panthor_sched_report_fw_events(struct panthor_device *ptdev, u32 events);
>  
> +#ifdef CONFIG_DEBUG_FS
> +void panthor_sched_debugfs_init(struct drm_minor *minor);
> +#endif
> +
>  #endif