[PATCH 1/2] drm/panthor: Enable fdinfo for cycle and time measurements

Tue Apr 23 15:51:44 UTC 2024

Hi Liviu,

Thanks for your review. Also want to apologise for replying so late.
Today I'll be sending a v2 of this patch series after having applied 
all your suggestions.

On 28.03.2024 15:49, Liviu Dudau wrote:
> Hi Adrián,
> 
> Appologies for the delay in reviewing this.
> 
> On Tue, Mar 05, 2024 at 09:05:49PM +0000, Adrián Larumbe wrote:
> > These values are sampled by the firmware right before jumping into the UM
> > command stream and immediately after returning from it, and then kept inside a
> > per-job accounting structure. That structure is held inside the group's syncobjs
> > buffer object, at an offset that depends on the job's queue slot number and the
> > queue's index within the group.
> 
> I think this commit message is misleadingly short compared to the size of the
> changes. If I may, I would like to suggest that you split this commit into two
> parts, one introducing the changes in the ringbuf and syncobjs structures and
> the other exporting the statistics in the fdinfo.
> 
> > 
> > Signed-off-by: Adrián Larumbe <adrian.larumbe at collabora.com>
> > ---
> >  drivers/gpu/drm/panthor/panthor_devfreq.c |  10 +
> >  drivers/gpu/drm/panthor/panthor_device.h  |  11 ++
> >  drivers/gpu/drm/panthor/panthor_drv.c     |  31 ++++
> >  drivers/gpu/drm/panthor/panthor_sched.c   | 217 +++++++++++++++++++---
> >  4 files changed, 241 insertions(+), 28 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/panthor/panthor_devfreq.c b/drivers/gpu/drm/panthor/panthor_devfreq.c
> > index 7ac4fa290f27..51a7b734edcd 100644
> > --- a/drivers/gpu/drm/panthor/panthor_devfreq.c
> > +++ b/drivers/gpu/drm/panthor/panthor_devfreq.c
> > @@ -91,6 +91,7 @@ static int panthor_devfreq_get_dev_status(struct device *dev,
> >  	spin_lock_irqsave(&pdevfreq->lock, irqflags);
> >  
> >  	panthor_devfreq_update_utilization(pdevfreq);
> > +	ptdev->current_frequency = status->current_frequency;
> >  
> >  	status->total_time = ktime_to_ns(ktime_add(pdevfreq->busy_time,
> >  						   pdevfreq->idle_time));
> > @@ -130,6 +131,7 @@ int panthor_devfreq_init(struct panthor_device *ptdev)
> >  	struct panthor_devfreq *pdevfreq;
> >  	struct dev_pm_opp *opp;
> >  	unsigned long cur_freq;
> > +	unsigned long freq = ULONG_MAX;
> >  	int ret;
> >  
> >  	pdevfreq = drmm_kzalloc(&ptdev->base, sizeof(*ptdev->devfreq), GFP_KERNEL);
> > @@ -204,6 +206,14 @@ int panthor_devfreq_init(struct panthor_device *ptdev)
> >  
> >  	dev_pm_opp_put(opp);
> >  
> > +	/* Find the fastest defined rate  */
> > +	opp = dev_pm_opp_find_freq_floor(dev, &freq);
> > +	if (IS_ERR(opp))
> > +		return PTR_ERR(opp);
> > +	ptdev->fast_rate = freq;
> > +
> > +	dev_pm_opp_put(opp);
> > +
> >  	/*
> >  	 * Setup default thresholds for the simple_ondemand governor.
> >  	 * The values are chosen based on experiments.
> > diff --git a/drivers/gpu/drm/panthor/panthor_device.h b/drivers/gpu/drm/panthor/panthor_device.h
> > index 51c9d61b6796..10e970921ca3 100644
> > --- a/drivers/gpu/drm/panthor/panthor_device.h
> > +++ b/drivers/gpu/drm/panthor/panthor_device.h
> > @@ -162,6 +162,14 @@ struct panthor_device {
> >  		 */
> >  		u32 *dummy_latest_flush;
> >  	} pm;
> > +
> > +	unsigned long current_frequency;
> > +	unsigned long fast_rate;
> > +};
> > +
> > +struct panthor_gpu_usage {
> > +	u64 time;
> > +	u64 cycles;
> >  };
> >  
> >  /**
> > @@ -176,6 +184,9 @@ struct panthor_file {
> >  
> >  	/** @groups: Scheduling group pool attached to this file. */
> >  	struct panthor_group_pool *groups;
> > +
> > +	/** @stats: cycle and timestamp measures for job execution. */
> > +	struct panthor_gpu_usage stats;
> >  };
> >  
> >  int panthor_device_init(struct panthor_device *ptdev);
> > diff --git a/drivers/gpu/drm/panthor/panthor_drv.c b/drivers/gpu/drm/panthor/panthor_drv.c
> > index ff484506229f..fa06b9e2c6cd 100644
> > --- a/drivers/gpu/drm/panthor/panthor_drv.c
> > +++ b/drivers/gpu/drm/panthor/panthor_drv.c
> > @@ -3,6 +3,10 @@
> >  /* Copyright 2019 Linaro, Ltd., Rob Herring <robh at kernel.org> */
> >  /* Copyright 2019 Collabora ltd. */
> >  
> > +#ifdef CONFIG_HAVE_ARM_ARCH_TIMER
> > +#include <asm/arch_timer.h>
> > +#endif
> > +
> >  #include <linux/list.h>
> >  #include <linux/module.h>
> >  #include <linux/of_platform.h>
> > @@ -28,6 +32,8 @@
> >  #include "panthor_regs.h"
> >  #include "panthor_sched.h"
> >  
> > +#define NS_PER_SEC      1000000000ULL
> > +
> >  /**
> >   * DOC: user <-> kernel object copy helpers.
> >   */
> > @@ -1336,6 +1342,29 @@ static int panthor_mmap(struct file *filp, struct vm_area_struct *vma)
> >  	return ret;
> >  }
> >  
> > +static void panthor_gpu_show_fdinfo(struct panthor_device *ptdev,
> > +				    struct panthor_file *pfile,
> > +				    struct drm_printer *p)
> > +{
> > +#ifdef CONFIG_HAVE_ARM_ARCH_TIMER
> > +	drm_printf(p, "drm-engine-panthor:\t%llu ns\n",
> > +		   DIV_ROUND_UP_ULL((pfile->stats.time * NS_PER_SEC),
> > +				    arch_timer_get_cntfrq()));
> > +#endif
> > +	drm_printf(p, "drm-cycles-panthor:\t%llu\n", pfile->stats.cycles);
> > +	drm_printf(p, "drm-maxfreq-panthor:\t%lu Hz\n", ptdev->fast_rate);
> > +	drm_printf(p, "drm-curfreq-panthor:\t%lu Hz\n", ptdev->current_frequency);
> > +}
> > +
> > +static void panthor_show_fdinfo(struct drm_printer *p, struct drm_file *file)
> > +{
> > +	struct drm_device *dev = file->minor->dev;
> > +	struct panthor_device *ptdev = container_of(dev, struct panthor_device, base);
> > +
> > +	panthor_gpu_show_fdinfo(ptdev, file->driver_priv, p);
> > +
> > +}
> > +
> >  static const struct file_operations panthor_drm_driver_fops = {
> >  	.open = drm_open,
> >  	.release = drm_release,
> > @@ -1345,6 +1374,7 @@ static const struct file_operations panthor_drm_driver_fops = {
> >  	.read = drm_read,
> >  	.llseek = noop_llseek,
> >  	.mmap = panthor_mmap,
> > +	.show_fdinfo = drm_show_fdinfo,
> >  };
> >  
> >  #ifdef CONFIG_DEBUG_FS
> > @@ -1363,6 +1393,7 @@ static const struct drm_driver panthor_drm_driver = {
> >  			   DRIVER_SYNCOBJ_TIMELINE | DRIVER_GEM_GPUVA,
> >  	.open = panthor_open,
> >  	.postclose = panthor_postclose,
> > +	.show_fdinfo = panthor_show_fdinfo,
> >  	.ioctls = panthor_drm_driver_ioctls,
> >  	.num_ioctls = ARRAY_SIZE(panthor_drm_driver_ioctls),
> >  	.fops = &panthor_drm_driver_fops,
> > diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
> > index 5f7803b6fc48..751d1453e7a1 100644
> > --- a/drivers/gpu/drm/panthor/panthor_sched.c
> > +++ b/drivers/gpu/drm/panthor/panthor_sched.c
> > @@ -93,6 +93,8 @@
> >  #define MIN_CSGS				3
> >  #define MAX_CSG_PRIO				0xf
> >  
> > +#define SLOTSIZE				(NUM_INSTRS_PER_SLOT * sizeof(u64))
> 
> Worth moving NUM_INSTRS_PER_SLOT here too?
> 
> > +
> >  struct panthor_group;
> >  
> >  /**
> > @@ -393,7 +395,13 @@ struct panthor_queue {
> >  #define CSF_MAX_QUEUE_PRIO	GENMASK(3, 0)
> >  
> >  	/** @ringbuf: Command stream ring-buffer. */
> > -	struct panthor_kernel_bo *ringbuf;
> > +	struct {
> > +		/** @ringbuf: Kernel BO that holds ring buffer. */
> > +		struct panthor_kernel_bo *bo;
> > +
> > +		/** @nelem: Number of slots in the ring buffer. */
> > +		unsigned int nelem;
> 
> I'm not convinced this nelem is needed. The only place it is used is to check that
> job->ringbuf_idx is not too big, which is something we should do in queue_run_job()
> function rather than in update_fdinfo_stats(). If we don't change ringbuf to a
> structure the patch slims down by more than a dozen lines.
> 
> > +	} ringbuf;
> >  
> >  	/** @iface: Firmware interface. */
> >  	struct {
> > @@ -466,6 +474,9 @@ struct panthor_queue {
> >  		 */
> >  		struct list_head in_flight_jobs;
> >  	} fence_ctx;
> > +
> > +	/** @time_offset: Offset of fdinfo stats structs in queue's syncobj. */
> > +	unsigned long time_offset;
> >  };
> >  
> >  /**
> > @@ -580,7 +591,26 @@ struct panthor_group {
> >  	 * One sync object per queue. The position of the sync object is
> >  	 * determined by the queue index.
> >  	 */
> > -	struct panthor_kernel_bo *syncobjs;
> > +
> > +	struct {
> > +		/** @bo: Kernel BO holding the sync objects. */
> > +		struct panthor_kernel_bo *bo;
> > +
> > +		/** @times_offset: Beginning of time stats after objects of sync pool. */
> > +		size_t times_offset;
> > +	} syncobjs;
> > +
> > +	/** @fdinfo: Per-file total cycle and timestamp values reference. */
> > +	struct {
> > +		/** @data: Pointer to actual per-file sample data. */
> > +		struct panthor_gpu_usage *data;
> > +
> > +		/**
> > +		 * @lock: Mutex to govern concurrent access from drm file's fdinfo callback
> > +		 * and job post-completion processing function
> > +		 */
> > +		struct mutex lock;
> > +	} fdinfo;
> >  
> >  	/** @state: Group state. */
> >  	enum panthor_group_state state;
> > @@ -639,6 +669,18 @@ struct panthor_group {
> >  	struct list_head wait_node;
> >  };
> >  
> > +struct panthor_job_times {
> > +	struct {
> > +		u64 before;
> > +		u64 after;
> > +	} cycles;
> > +
> > +	struct {
> > +		u64 before;
> > +		u64 after;
> > +	} time;
> > +};
> > +
> >  /**
> >   * group_queue_work() - Queue a group work
> >   * @group: Group to queue the work for.
> > @@ -718,6 +760,9 @@ struct panthor_job {
> >  	/** @queue_idx: Index of the queue inside @group. */
> >  	u32 queue_idx;
> >  
> > +	/** @ringbuf_idx: Index of the queue inside @queue. */
> 
> Index of the ringbuffer inside @queue.
> 
> > +	u32 ringbuf_idx;
> > +
> >  	/** @call_info: Information about the userspace command stream call. */
> >  	struct {
> >  		/** @start: GPU address of the userspace command stream. */
> > @@ -814,7 +859,7 @@ static void group_free_queue(struct panthor_group *group, struct panthor_queue *
> >  
> >  	panthor_queue_put_syncwait_obj(queue);
> >  
> > -	panthor_kernel_bo_destroy(group->vm, queue->ringbuf);
> > +	panthor_kernel_bo_destroy(group->vm, queue->ringbuf.bo);
> >  	panthor_kernel_bo_destroy(panthor_fw_vm(group->ptdev), queue->iface.mem);
> >  
> >  	kfree(queue);
> > @@ -828,12 +873,14 @@ static void group_release_work(struct work_struct *work)
> >  	struct panthor_device *ptdev = group->ptdev;
> >  	u32 i;
> >  
> > +	mutex_destroy(&group->fdinfo.lock);
> > +
> >  	for (i = 0; i < group->queue_count; i++)
> >  		group_free_queue(group, group->queues[i]);
> >  
> >  	panthor_kernel_bo_destroy(panthor_fw_vm(ptdev), group->suspend_buf);
> >  	panthor_kernel_bo_destroy(panthor_fw_vm(ptdev), group->protm_suspend_buf);
> > -	panthor_kernel_bo_destroy(group->vm, group->syncobjs);
> > +	panthor_kernel_bo_destroy(group->vm, group->syncobjs.bo);
> >  
> >  	panthor_vm_put(group->vm);
> >  	kfree(group);
> > @@ -970,8 +1017,8 @@ cs_slot_prog_locked(struct panthor_device *ptdev, u32 csg_id, u32 cs_id)
> >  	queue->iface.input->extract = queue->iface.output->extract;
> >  	drm_WARN_ON(&ptdev->base, queue->iface.input->insert < queue->iface.input->extract);
> >  
> > -	cs_iface->input->ringbuf_base = panthor_kernel_bo_gpuva(queue->ringbuf);
> > -	cs_iface->input->ringbuf_size = panthor_kernel_bo_size(queue->ringbuf);
> > +	cs_iface->input->ringbuf_base = panthor_kernel_bo_gpuva(queue->ringbuf.bo);
> > +	cs_iface->input->ringbuf_size = panthor_kernel_bo_size(queue->ringbuf.bo);
> >  	cs_iface->input->ringbuf_input = queue->iface.input_fw_va;
> >  	cs_iface->input->ringbuf_output = queue->iface.output_fw_va;
> >  	cs_iface->input->config = CS_CONFIG_PRIORITY(queue->priority) |
> > @@ -1926,7 +1973,7 @@ tick_ctx_init(struct panthor_scheduler *sched,
> >  	}
> >  }
> >  
> > -#define NUM_INSTRS_PER_SLOT		16
> > +#define NUM_INSTRS_PER_SLOT		32
> 
> I guess this macro has to be a value that is a power of 2, as it used to divide the ringbuffer size, right?
> 
> >  
> >  static void
> >  group_term_post_processing(struct panthor_group *group)
> > @@ -1964,7 +2011,7 @@ group_term_post_processing(struct panthor_group *group)
> >  		spin_unlock(&queue->fence_ctx.lock);
> >  
> >  		/* Manually update the syncobj seqno to unblock waiters. */
> > -		syncobj = group->syncobjs->kmap + (i * sizeof(*syncobj));
> > +		syncobj = group->syncobjs.bo->kmap + (i * sizeof(*syncobj));
> >  		syncobj->status = ~0;
> >  		syncobj->seqno = atomic64_read(&queue->fence_ctx.seqno);
> >  		sched_queue_work(group->ptdev->scheduler, sync_upd);
> > @@ -2715,6 +2762,30 @@ void panthor_sched_post_reset(struct panthor_device *ptdev)
> >  	sched_queue_work(sched, sync_upd);
> >  }
> >  
> > +static void update_fdinfo_stats(struct panthor_job *job)
> > +{
> > +	struct panthor_group *group = job->group;
> > +	struct panthor_queue *queue = group->queues[job->queue_idx];
> > +	struct panthor_device *ptdev = group->ptdev;
> > +	struct panthor_gpu_usage *fdinfo;
> > +	struct panthor_job_times *times;
> > +
> > +	if (drm_WARN_ON(&ptdev->base, job->ringbuf_idx >= queue->ringbuf.nelem))
> > +		return;
> > +
> > +	times = (struct panthor_job_times *)
> > +		((unsigned long)group->syncobjs.bo->kmap + queue->time_offset +
> > +		 (job->ringbuf_idx * sizeof(struct panthor_job_times)));
> > +
> > +	mutex_lock(&group->fdinfo.lock);
> > +	if ((group->fdinfo.data)) {
> > +		fdinfo = group->fdinfo.data;
> > +		fdinfo->cycles += times->cycles.after - times->cycles.before;
> > +		fdinfo->time += times->time.after - times->time.before;
> > +	}
> > +	mutex_unlock(&group->fdinfo.lock);
> > +}
> > +
> >  static void group_sync_upd_work(struct work_struct *work)
> >  {
> >  	struct panthor_group *group =
> > @@ -2732,7 +2803,7 @@ static void group_sync_upd_work(struct work_struct *work)
> >  		if (!queue)
> >  			continue;
> >  
> > -		syncobj = group->syncobjs->kmap + (queue_idx * sizeof(*syncobj));
> > +		syncobj = group->syncobjs.bo->kmap + (queue_idx * sizeof(*syncobj));
> >  
> >  		spin_lock(&queue->fence_ctx.lock);
> >  		list_for_each_entry_safe(job, job_tmp, &queue->fence_ctx.in_flight_jobs, node) {
> > @@ -2750,6 +2821,7 @@ static void group_sync_upd_work(struct work_struct *work)
> >  	dma_fence_end_signalling(cookie);
> >  
> >  	list_for_each_entry_safe(job, job_tmp, &done_jobs, node) {
> > +		update_fdinfo_stats(job);
> >  		list_del_init(&job->node);
> >  		panthor_job_put(&job->base);
> >  	}
> > @@ -2765,13 +2837,19 @@ queue_run_job(struct drm_sched_job *sched_job)
> >  	struct panthor_queue *queue = group->queues[job->queue_idx];
> >  	struct panthor_device *ptdev = group->ptdev;
> >  	struct panthor_scheduler *sched = ptdev->scheduler;
> > -	u32 ringbuf_size = panthor_kernel_bo_size(queue->ringbuf);
> > +	u32 ringbuf_size = panthor_kernel_bo_size(queue->ringbuf.bo);
> >  	u32 ringbuf_insert = queue->iface.input->insert & (ringbuf_size - 1);
> > +	u32 ringbuf_index = ringbuf_insert / (SLOTSIZE);
> >  	u64 addr_reg = ptdev->csif_info.cs_reg_count -
> >  		       ptdev->csif_info.unpreserved_cs_reg_count;
> >  	u64 val_reg = addr_reg + 2;
> > -	u64 sync_addr = panthor_kernel_bo_gpuva(group->syncobjs) +
> > -			job->queue_idx * sizeof(struct panthor_syncobj_64b);
> > +	u64 cycle_reg = addr_reg;
> > +	u64 time_reg = val_reg;
> > +	u64 sync_addr = panthor_kernel_bo_gpuva(group->syncobjs.bo) +
> > +		job->queue_idx * sizeof(struct panthor_syncobj_64b);
> > +	u64 times_addr = panthor_kernel_bo_gpuva(group->syncobjs.bo) + queue->time_offset +
> > +		(ringbuf_index * sizeof(struct panthor_job_times));
> > +
> >  	u32 waitall_mask = GENMASK(sched->sb_slot_count - 1, 0);
> >  	struct dma_fence *done_fence;
> >  	int ret;
> > @@ -2783,6 +2861,18 @@ queue_run_job(struct drm_sched_job *sched_job)
> >  		/* FLUSH_CACHE2.clean_inv_all.no_wait.signal(0) rX+2 */
> >  		(36ull << 56) | (0ull << 48) | (val_reg << 40) | (0 << 16) | 0x233,
> >  
> > +		/* MOV48 rX:rX+1, cycles_offset */
> > +		(1ull << 56) | (cycle_reg << 48) | (times_addr + offsetof(struct panthor_job_times, cycles.before)),
> > +
> > +		/* MOV48 rX:rX+1, time_offset */
> > +		(1ull << 56) | (time_reg << 48) | (times_addr + offsetof(struct panthor_job_times, time.before)),
> > +
> > +		/* STORE_STATE cycles */
> > +		(40ull << 56) |  (cycle_reg << 40) | (1ll << 32),
> > +
> > +		/* STORE_STATE timer */
> > +		(40ull << 56) |  (time_reg << 40) | (0ll << 32),
> > +
> >  		/* MOV48 rX:rX+1, cs.start */
> >  		(1ull << 56) | (addr_reg << 48) | job->call_info.start,
> >  
> > @@ -2795,6 +2885,18 @@ queue_run_job(struct drm_sched_job *sched_job)
> >  		/* CALL rX:rX+1, rX+2 */
> >  		(32ull << 56) | (addr_reg << 40) | (val_reg << 32),
> >  
> > +		/* MOV48 rX:rX+1, cycles_offset */
> > +		(1ull << 56) | (cycle_reg << 48) | (times_addr + offsetof(struct panthor_job_times, cycles.after)),
> > +
> > +		/* MOV48 rX:rX+1, time_offset */
> > +		(1ull << 56) | (time_reg << 48) | (times_addr + offsetof(struct panthor_job_times, time.after)),
> > +
> > +		/* STORE_STATE cycles */
> > +		(40ull << 56) |  (cycle_reg << 40) | (1ll << 32),
> > +
> > +		/* STORE_STATE timer */
> > +		(40ull << 56) |  (time_reg << 40) | (0ll << 32),
> > +
> >  		/* MOV48 rX:rX+1, sync_addr */
> >  		(1ull << 56) | (addr_reg << 48) | sync_addr,
> >  
> > @@ -2839,7 +2941,7 @@ queue_run_job(struct drm_sched_job *sched_job)
> >  		       queue->fence_ctx.id,
> >  		       atomic64_inc_return(&queue->fence_ctx.seqno));
> >  
> > -	memcpy(queue->ringbuf->kmap + ringbuf_insert,
> > +	memcpy(queue->ringbuf.bo->kmap + ringbuf_insert,
> >  	       call_instrs, sizeof(call_instrs));
> >  
> >  	panthor_job_get(&job->base);
> > @@ -2849,6 +2951,7 @@ queue_run_job(struct drm_sched_job *sched_job)
> >  
> >  	job->ringbuf.start = queue->iface.input->insert;
> >  	job->ringbuf.end = job->ringbuf.start + sizeof(call_instrs);
> > +	job->ringbuf_idx = ringbuf_index;
> >  
> >  	/* Make sure the ring buffer is updated before the INSERT
> >  	 * register.
> > @@ -2939,7 +3042,8 @@ static const struct drm_sched_backend_ops panthor_queue_sched_ops = {
> >  
> >  static struct panthor_queue *
> >  group_create_queue(struct panthor_group *group,
> > -		   const struct drm_panthor_queue_create *args)
> > +		   const struct drm_panthor_queue_create *args,
> > +		   unsigned int slots_so_far)
> >  {
> >  	struct drm_gpu_scheduler *drm_sched;
> >  	struct panthor_queue *queue;
> > @@ -2965,21 +3069,23 @@ group_create_queue(struct panthor_group *group,
> >  
> >  	queue->priority = args->priority;
> >  
> > -	queue->ringbuf = panthor_kernel_bo_create(group->ptdev, group->vm,
> > +	queue->ringbuf.bo = panthor_kernel_bo_create(group->ptdev, group->vm,
> >  						  args->ringbuf_size,
> >  						  DRM_PANTHOR_BO_NO_MMAP,
> >  						  DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC |
> >  						  DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED,
> >  						  PANTHOR_VM_KERNEL_AUTO_VA);
> > -	if (IS_ERR(queue->ringbuf)) {
> > -		ret = PTR_ERR(queue->ringbuf);
> > +	if (IS_ERR(queue->ringbuf.bo)) {
> > +		ret = PTR_ERR(queue->ringbuf.bo);
> >  		goto err_free_queue;
> >  	}
> >  
> > -	ret = panthor_kernel_bo_vmap(queue->ringbuf);
> > +	ret = panthor_kernel_bo_vmap(queue->ringbuf.bo);
> >  	if (ret)
> >  		goto err_free_queue;
> >  
> > +	queue->ringbuf.nelem = (args->ringbuf_size / (SLOTSIZE));
> > +
> >  	queue->iface.mem = panthor_fw_alloc_queue_iface_mem(group->ptdev,
> >  							    &queue->iface.input,
> >  							    &queue->iface.output,
> > @@ -2990,6 +3096,9 @@ group_create_queue(struct panthor_group *group,
> >  		goto err_free_queue;
> >  	}
> >  
> > +	queue->time_offset = group->syncobjs.times_offset +
> > +		(slots_so_far * sizeof(struct panthor_job_times));
> > +
> >  	ret = drm_sched_init(&queue->scheduler, &panthor_queue_sched_ops,
> >  			     group->ptdev->scheduler->wq, 1,
> >  			     args->ringbuf_size / (NUM_INSTRS_PER_SLOT * sizeof(u64)),
> 
> You can use the newly added SLOTSIZE here.
> 
> > @@ -3020,6 +3129,7 @@ int panthor_group_create(struct panthor_file *pfile,
> >  	struct panthor_scheduler *sched = ptdev->scheduler;
> >  	struct panthor_fw_csg_iface *csg_iface = panthor_fw_get_csg_iface(ptdev, 0);
> >  	struct panthor_group *group = NULL;
> > +	unsigned int total_slots;
> >  	u32 gid, i, suspend_size;
> >  	int ret;
> >  
> > @@ -3086,33 +3196,77 @@ int panthor_group_create(struct panthor_file *pfile,
> >  		goto err_put_group;
> >  	}
> >  
> > -	group->syncobjs = panthor_kernel_bo_create(ptdev, group->vm,
> > -						   group_args->queues.count *
> > -						   sizeof(struct panthor_syncobj_64b),
> > +	/*
> > +	 * Need to add size for the fdinfo sample structs, as many as the sum
> > +	 * of the number of job slots for every single queue ringbuffer.
> > +	 */
> > +
> > +	for (i = 0, total_slots = 0; i < group_args->queues.count; i++)
> > +		total_slots += (queue_args[i].ringbuf_size / (SLOTSIZE));
> 
> Minor nit: We should pre-compute here (group_args->queues.count * sizeof(struct panthor_syncobj_64b)) + \
> total_slots * sizeof(struct panthor_job_times) and then use it later as argument to panthor_kernel_bo_create()
> and memset().
> 
> > +
> > +	/*
> > +	 * Memory layout of group's syncobjs BO
> > +	 * group->syncobjs.bo {
> > +	 *	struct panthor_syncobj_64b sync1;
> > +	 *	struct panthor_syncobj_64b sync2;
> > +	 *		...
> > +	 *		As many as group_args->queues.count
> > +	 *		...
> > +	 *	struct panthor_syncobj_64b syncn;
> > +	 *	struct panthor_job_times queue_1slot_1
> > +	 *	struct panthor_job_times queue_1slot_2
> > +	 *		...
> > +	 *		As many as queue[i].ringbuf_size / SLOTSIZE
> > +	 *		...
> > +	 *	struct panthor_job_times queue_1slot_p
> > +	 *		...
> > +	 *		As many as group_args->queues.count
> > +	 *		...
> > +	 *	struct panthor_job_times queue_nslot_1
> > +	 *	struct panthor_job_times queue_nslot_2
> > +	 *		...
> > +	 *		As many as queue[n].ringbuf_size / SLOTSIZE
> > +	 *	struct panthor_job_times queue_nslot_q
> 
> Minor nit: I find it more readable the form "queue1_slotP"... "queueN_slotQ".
> 
> > +	 *
> > +	 *	Linearly, group->syncobjs.bo = {syncojb1,..,syncobjN,
> > +	 *	{queue1 = {js1,..,jsp},..,queueN = {js1,..,jsq}}}
> > +	 * }
> > +	 *
> > +	 */
> > +
> > +	group->syncobjs.bo = panthor_kernel_bo_create(ptdev, group->vm,
> > +						   (group_args->queues.count *
> > +						    sizeof(struct panthor_syncobj_64b))
> > +						   + (total_slots * sizeof(struct panthor_job_times)),
> >  						   DRM_PANTHOR_BO_NO_MMAP,
> >  						   DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC |
> >  						   DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED,
> >  						   PANTHOR_VM_KERNEL_AUTO_VA);
> > -	if (IS_ERR(group->syncobjs)) {
> > -		ret = PTR_ERR(group->syncobjs);
> > +	if (IS_ERR(group->syncobjs.bo)) {
> > +		ret = PTR_ERR(group->syncobjs.bo);
> >  		goto err_put_group;
> >  	}
> >  
> > -	ret = panthor_kernel_bo_vmap(group->syncobjs);
> > +	ret = panthor_kernel_bo_vmap(group->syncobjs.bo);
> >  	if (ret)
> >  		goto err_put_group;
> >  
> > -	memset(group->syncobjs->kmap, 0,
> > -	       group_args->queues.count * sizeof(struct panthor_syncobj_64b));
> > +	memset(group->syncobjs.bo->kmap, 0,
> > +	       (group_args->queues.count * sizeof(struct panthor_syncobj_64b)) +
> > +	       (total_slots * sizeof(struct panthor_job_times)));
> >  
> > -	for (i = 0; i < group_args->queues.count; i++) {
> > -		group->queues[i] = group_create_queue(group, &queue_args[i]);
> > +	group->syncobjs.times_offset =
> > +		group_args->queues.count * sizeof(struct panthor_syncobj_64b);
> > +
> > +	for (i = 0, total_slots = 0; i < group_args->queues.count; i++) {
> > +		group->queues[i] = group_create_queue(group, &queue_args[i], total_slots);
> >  		if (IS_ERR(group->queues[i])) {
> >  			ret = PTR_ERR(group->queues[i]);
> >  			group->queues[i] = NULL;
> >  			goto err_put_group;
> >  		}
> >  
> > +		total_slots += (queue_args[i].ringbuf_size / (SLOTSIZE));
> >  		group->queue_count++;
> >  	}
> >  
> > @@ -3133,6 +3287,9 @@ int panthor_group_create(struct panthor_file *pfile,
> >  	}
> >  	mutex_unlock(&sched->reset.lock);
> >  
> > +	group->fdinfo.data = &pfile->stats;
> > +	mutex_init(&group->fdinfo.lock);
> > +
> >  	return gid;
> >  
> >  err_put_group:
> > @@ -3172,6 +3329,10 @@ int panthor_group_destroy(struct panthor_file *pfile, u32 group_handle)
> >  	mutex_unlock(&sched->lock);
> >  	mutex_unlock(&sched->reset.lock);
> >  
> > +	mutex_lock(&group->fdinfo.lock);
> > +	group->fdinfo.data = NULL;
> > +	mutex_unlock(&group->fdinfo.lock);
> > +
> >  	group_put(group);
> >  	return 0;
> >  }
> > -- 
> > 2.43.0
> >
> 
> I've tried to review the patch as best as I could, specially the math. AFAICT it all checks out,
> would be good for others to have a look.
> 
> Best regards,
> Liviu
> 
> -- 
> ====================
> | I would like to |
> | fix the world,  |
> | but they're not |
> | giving me the   |
>  \ source code!  /
>   ---------------
>     ¯\_(ツ)_/¯