[PATCH v7 1/5] drm/panthor: introduce job cycle and timestamp accounting
Steven Price
steven.price at arm.com
Mon Sep 23 09:07:14 UTC 2024
On 21/09/2024 00:43, Adrián Larumbe wrote:
> Enable calculations of job submission times in clock cycles and wall
> time. This is done by expanding the boilerplate command stream when running
> a job to include instructions that compute said times right before and
> after a user CS.
>
> A separate kernel BO is created per queue to store those values. Jobs can
> access their sampled data through an index different from that of the
> queue's ringbuffer. The reason for this is saving memory on the profiling
> information kernel BO, since the amount of simultaneous profiled jobs we
> can write into the queue's ringbuffer might be much smaller than for
> regular jobs, as the former take more CSF instructions.
>
> This commit is done in preparation for enabling DRM fdinfo support in the
> Panthor driver, which depends on the numbers calculated herein.
>
> A profile mode mask has been added that will in a future commit allow UM to
> toggle performance metric sampling behaviour, which is disabled by default
> to save power. When a ringbuffer CS is constructed, timestamp and cycling
> sampling instructions are added depending on the enabled flags in the
> profiling mask.
>
> A helper was provided that calculates the number of instructions for a
> given set of enablement mask, and these are passed as the number of credits
> when initialising a DRM scheduler job.
>
> Signed-off-by: Adrián Larumbe <adrian.larumbe at collabora.com>
> Reviewed-by: Boris Brezillon <boris.brezillon at collabora.com>
> Reviewed-by: Liviu Dudau <liviu.dudau at arm.com>
I think just one bug remaining - see below...
> ---
> drivers/gpu/drm/panthor/panthor_device.h | 22 ++
> drivers/gpu/drm/panthor/panthor_sched.c | 328 +++++++++++++++++++----
> 2 files changed, 301 insertions(+), 49 deletions(-)
>
> diff --git a/drivers/gpu/drm/panthor/panthor_device.h b/drivers/gpu/drm/panthor/panthor_device.h
> index e388c0472ba7..a48e30d0af30 100644
> --- a/drivers/gpu/drm/panthor/panthor_device.h
> +++ b/drivers/gpu/drm/panthor/panthor_device.h
> @@ -66,6 +66,25 @@ struct panthor_irq {
> atomic_t suspended;
> };
>
> +/**
> + * enum panthor_device_profiling_mode - Profiling state
> + */
> +enum panthor_device_profiling_flags {
> + /** @PANTHOR_DEVICE_PROFILING_DISABLED: Profiling is disabled. */
> + PANTHOR_DEVICE_PROFILING_DISABLED = 0,
> +
> + /** @PANTHOR_DEVICE_PROFILING_CYCLES: Sampling job cycles. */
> + PANTHOR_DEVICE_PROFILING_CYCLES = BIT(0),
> +
> + /** @PANTHOR_DEVICE_PROFILING_TIMESTAMP: Sampling job timestamp. */
> + PANTHOR_DEVICE_PROFILING_TIMESTAMP = BIT(1),
> +
> + /** @PANTHOR_DEVICE_PROFILING_ALL: Sampling everything. */
> + PANTHOR_DEVICE_PROFILING_ALL =
> + PANTHOR_DEVICE_PROFILING_CYCLES |
> + PANTHOR_DEVICE_PROFILING_TIMESTAMP,
> +};
> +
> /**
> * struct panthor_device - Panthor device
> */
> @@ -162,6 +181,9 @@ struct panthor_device {
> */
> struct page *dummy_latest_flush;
> } pm;
> +
> + /** @profile_mask: User-set profiling flags for job accounting. */
> + u32 profile_mask;
> };
>
> /**
> diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
> index 42afdf0ddb7e..6da5c3d0015e 100644
> --- a/drivers/gpu/drm/panthor/panthor_sched.c
> +++ b/drivers/gpu/drm/panthor/panthor_sched.c
> @@ -93,6 +93,9 @@
> #define MIN_CSGS 3
> #define MAX_CSG_PRIO 0xf
>
> +#define NUM_INSTRS_PER_CACHE_LINE (64 / sizeof(u64))
> +#define MAX_INSTRS_PER_JOB 24
> +
> struct panthor_group;
>
> /**
> @@ -476,6 +479,18 @@ struct panthor_queue {
> */
> struct list_head in_flight_jobs;
> } fence_ctx;
> +
> + /** @profiling: Job profiling data slots and access information. */
> + struct {
> + /** @slots: Kernel BO holding the slots. */
> + struct panthor_kernel_bo *slots;
> +
> + /** @slot_count: Number of jobs ringbuffer can hold at once. */
> + u32 slot_count;
> +
> + /** @seqno: Index of the next available profiling information slot. */
> + u32 seqno;
> + } profiling;
> };
>
> /**
> @@ -661,6 +676,18 @@ struct panthor_group {
> struct list_head wait_node;
> };
>
> +struct panthor_job_profiling_data {
> + struct {
> + u64 before;
> + u64 after;
> + } cycles;
> +
> + struct {
> + u64 before;
> + u64 after;
> + } time;
> +};
> +
> /**
> * group_queue_work() - Queue a group work
> * @group: Group to queue the work for.
> @@ -774,6 +801,15 @@ struct panthor_job {
>
> /** @done_fence: Fence signaled when the job is finished or cancelled. */
> struct dma_fence *done_fence;
> +
> + /** @profiling: Job profiling information. */
> + struct {
> + /** @mask: Current device job profiling enablement bitmask. */
> + u32 mask;
> +
> + /** @slot: Job index in the profiling slots BO. */
> + u32 slot;
> + } profiling;
> };
>
> static void
> @@ -838,6 +874,7 @@ static void group_free_queue(struct panthor_group *group, struct panthor_queue *
>
> panthor_kernel_bo_destroy(queue->ringbuf);
> panthor_kernel_bo_destroy(queue->iface.mem);
> + panthor_kernel_bo_destroy(queue->profiling.slots);
>
> /* Release the last_fence we were holding, if any. */
> dma_fence_put(queue->fence_ctx.last_fence);
> @@ -1982,8 +2019,6 @@ tick_ctx_init(struct panthor_scheduler *sched,
> }
> }
>
> -#define NUM_INSTRS_PER_SLOT 16
> -
> static void
> group_term_post_processing(struct panthor_group *group)
> {
> @@ -2815,65 +2850,192 @@ static void group_sync_upd_work(struct work_struct *work)
> group_put(group);
> }
>
> -static struct dma_fence *
> -queue_run_job(struct drm_sched_job *sched_job)
> +struct panthor_job_ringbuf_instrs {
> + u64 buffer[MAX_INSTRS_PER_JOB];
> + u32 count;
> +};
> +
> +struct panthor_job_instr {
> + u32 profile_mask;
> + u64 instr;
> +};
> +
> +#define JOB_INSTR(__prof, __instr) \
> + { \
> + .profile_mask = __prof, \
> + .instr = __instr, \
> + }
> +
> +static void
> +copy_instrs_to_ringbuf(struct panthor_queue *queue,
> + struct panthor_job *job,
> + struct panthor_job_ringbuf_instrs *instrs)
> +{
> + u64 ringbuf_size = panthor_kernel_bo_size(queue->ringbuf);
> + u64 start = job->ringbuf.start & (ringbuf_size - 1);
> + u64 size, written;
> +
> + /*
> + * We need to write a whole slot, including any trailing zeroes
> + * that may come at the end of it. Also, because instrs.buffer has
> + * been zero-initialised, there's no need to pad it with 0's
> + */
> + instrs->count = ALIGN(instrs->count, NUM_INSTRS_PER_CACHE_LINE);
> + size = instrs->count * sizeof(u64);
> + WARN_ON(size > ringbuf_size);
> + written = min(ringbuf_size - start, size);
> +
> + memcpy(queue->ringbuf->kmap + start, instrs->buffer, written);
> +
> + if (written < size)
> + memcpy(queue->ringbuf->kmap,
> + &instrs->buffer[written/sizeof(u64)],
> + size - written);
> +}
> +
> +struct panthor_job_cs_params {
> + u32 profile_mask;
> + u64 addr_reg; u64 val_reg;
> + u64 cycle_reg; u64 time_reg;
> + u64 sync_addr; u64 times_addr;
> + u64 cs_start; u64 cs_size;
> + u32 last_flush; u32 waitall_mask;
> +};
> +
> +static void
> +get_job_cs_params(struct panthor_job *job, struct panthor_job_cs_params *params)
> {
> - struct panthor_job *job = container_of(sched_job, struct panthor_job, base);
> struct panthor_group *group = job->group;
> struct panthor_queue *queue = group->queues[job->queue_idx];
> struct panthor_device *ptdev = group->ptdev;
> struct panthor_scheduler *sched = ptdev->scheduler;
> - u32 ringbuf_size = panthor_kernel_bo_size(queue->ringbuf);
> - u32 ringbuf_insert = queue->iface.input->insert & (ringbuf_size - 1);
> - u64 addr_reg = ptdev->csif_info.cs_reg_count -
> - ptdev->csif_info.unpreserved_cs_reg_count;
> - u64 val_reg = addr_reg + 2;
> - u64 sync_addr = panthor_kernel_bo_gpuva(group->syncobjs) +
> - job->queue_idx * sizeof(struct panthor_syncobj_64b);
> - u32 waitall_mask = GENMASK(sched->sb_slot_count - 1, 0);
> - struct dma_fence *done_fence;
> - int ret;
>
> - u64 call_instrs[NUM_INSTRS_PER_SLOT] = {
> - /* MOV32 rX+2, cs.latest_flush */
> - (2ull << 56) | (val_reg << 48) | job->call_info.latest_flush,
> + params->addr_reg = ptdev->csif_info.cs_reg_count -
> + ptdev->csif_info.unpreserved_cs_reg_count;
> + params->val_reg = params->addr_reg + 2;
> + params->cycle_reg = params->addr_reg;
> + params->time_reg = params->val_reg;
>
> - /* FLUSH_CACHE2.clean_inv_all.no_wait.signal(0) rX+2 */
> - (36ull << 56) | (0ull << 48) | (val_reg << 40) | (0 << 16) | 0x233,
> + params->sync_addr = panthor_kernel_bo_gpuva(group->syncobjs) +
> + job->queue_idx * sizeof(struct panthor_syncobj_64b);
> + params->times_addr = panthor_kernel_bo_gpuva(queue->profiling.slots) +
> + (job->profiling.slot * sizeof(struct panthor_job_profiling_data));
> + params->waitall_mask = GENMASK(sched->sb_slot_count - 1, 0);
>
> - /* MOV48 rX:rX+1, cs.start */
> - (1ull << 56) | (addr_reg << 48) | job->call_info.start,
> + params->cs_start = job->call_info.start;
> + params->cs_size = job->call_info.size;
> + params->last_flush = job->call_info.latest_flush;
>
> - /* MOV32 rX+2, cs.size */
> - (2ull << 56) | (val_reg << 48) | job->call_info.size,
> + params->profile_mask = job->profiling.mask;
> +}
>
> - /* WAIT(0) => waits for FLUSH_CACHE2 instruction */
> - (3ull << 56) | (1 << 16),
> +#define JOB_INSTR_ALWAYS(instr) \
> + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED, (instr))
> +#define JOB_INSTR_TIMESTAMP(instr) \
> + JOB_INSTR(PANTHOR_DEVICE_PROFILING_TIMESTAMP, (instr))
> +#define JOB_INSTR_CYCLES(instr) \
> + JOB_INSTR(PANTHOR_DEVICE_PROFILING_CYCLES, (instr))
>
> +static void
> +prepare_job_instrs(const struct panthor_job_cs_params *params,
> + struct panthor_job_ringbuf_instrs *instrs)
> +{
> + const struct panthor_job_instr instr_seq[] = {
> + /* MOV32 rX+2, cs.latest_flush */
> + JOB_INSTR_ALWAYS((2ull << 56) | (params->val_reg << 48) | params->last_flush),
> + /* FLUSH_CACHE2.clean_inv_all.no_wait.signal(0) rX+2 */
> + JOB_INSTR_ALWAYS((36ull << 56) | (0ull << 48) | (params->val_reg << 40) | (0 << 16) | 0x233),
> + /* MOV48 rX:rX+1, cycles_offset */
> + JOB_INSTR_CYCLES((1ull << 56) | (params->cycle_reg << 48) |
> + (params->times_addr + offsetof(struct panthor_job_profiling_data, cycles.before))),
> + /* STORE_STATE cycles */
> + JOB_INSTR_CYCLES((40ull << 56) | (params->cycle_reg << 40) | (1ll << 32)),
> + /* MOV48 rX:rX+1, time_offset */
> + JOB_INSTR_TIMESTAMP((1ull << 56) | (params->time_reg << 48) | (params->times_addr +
> + offsetof(struct panthor_job_profiling_data, time.before))),
> + /* STORE_STATE timer */
> + JOB_INSTR_TIMESTAMP((40ull << 56) | (params->time_reg << 40) | (0ll << 32)),
> + /* MOV48 rX:rX+1, cs.start */
> + JOB_INSTR_ALWAYS((1ull << 56) | (params->addr_reg << 48) | params->cs_start),
> + /* MOV32 rX+2, cs.size */
> + JOB_INSTR_ALWAYS((2ull << 56) | (params->val_reg << 48) | params->cs_size),
> + /* WAIT(0) => waits for FLUSH_CACHE2 instruction */
> + JOB_INSTR_ALWAYS((3ull << 56) | (1 << 16)),
> /* CALL rX:rX+1, rX+2 */
> - (32ull << 56) | (addr_reg << 40) | (val_reg << 32),
> -
> + JOB_INSTR_ALWAYS((32ull << 56) | (params->addr_reg << 40) | (params->val_reg << 32)),
> + /* MOV48 rX:rX+1, cycles_offset */
> + JOB_INSTR_CYCLES((1ull << 56) | (params->cycle_reg << 48) |
> + (params->times_addr + offsetof(struct panthor_job_profiling_data, cycles.after))),
> + /* STORE_STATE cycles */
> + JOB_INSTR_CYCLES((40ull << 56) | (params->cycle_reg << 40) | (1ll << 32)),
> + /* MOV48 rX:rX+1, time_offset */
> + JOB_INSTR_TIMESTAMP((1ull << 56) | (params->time_reg << 48) |
> + (params->times_addr + offsetof(struct panthor_job_profiling_data, time.after))),
> + /* STORE_STATE timer */
> + JOB_INSTR_TIMESTAMP((40ull << 56) | (params->time_reg << 40) | (0ll << 32)),
> /* MOV48 rX:rX+1, sync_addr */
> - (1ull << 56) | (addr_reg << 48) | sync_addr,
> -
> + JOB_INSTR_ALWAYS((1ull << 56) | (params->addr_reg << 48) | params->sync_addr),
> /* MOV48 rX+2, #1 */
> - (1ull << 56) | (val_reg << 48) | 1,
> -
> + JOB_INSTR_ALWAYS((1ull << 56) | (params->val_reg << 48) | 1),
> /* WAIT(all) */
> - (3ull << 56) | (waitall_mask << 16),
> -
> + JOB_INSTR_ALWAYS((3ull << 56) | (params->waitall_mask << 16)),
> /* SYNC_ADD64.system_scope.propage_err.nowait rX:rX+1, rX+2*/
> - (51ull << 56) | (0ull << 48) | (addr_reg << 40) | (val_reg << 32) | (0 << 16) | 1,
> + JOB_INSTR_ALWAYS((51ull << 56) | (0ull << 48) | (params->addr_reg << 40) |
> + (params->val_reg << 32) | (0 << 16) | 1),
> + /* ERROR_BARRIER, so we can recover from faults at job boundaries. */
> + JOB_INSTR_ALWAYS((47ull << 56)),
> + };
> + u32 pad;
>
> - /* ERROR_BARRIER, so we can recover from faults at job
> - * boundaries.
> - */
> - (47ull << 56),
> + /* NEED to be cacheline aligned to please the prefetcher. */
> + static_assert(sizeof(instrs->buffer) % 64 == 0,
> + "panthor_job_ringbuf_instrs::buffer is not aligned on a cacheline");
> +
> + /* Make sure we have enough storage to store the whole sequence. */
> + static_assert(ALIGN(ARRAY_SIZE(instr_seq), NUM_INSTRS_PER_CACHE_LINE) ==
> + ARRAY_SIZE(instrs->buffer),
> + "instr_seq vs panthor_job_ringbuf_instrs::buffer size mismatch");
> +
> + for (u32 i = 0; i < ARRAY_SIZE(instr_seq); i++) {
> + /* If the profile mask of this instruction is not enabled, skip it. */
> + if (instr_seq[i].profile_mask &&
> + !(instr_seq[i].profile_mask & params->profile_mask))
> + continue;
> +
> + instrs->buffer[instrs->count++] = instr_seq[i].instr;
> + }
> +
> + pad = ALIGN(instrs->count, NUM_INSTRS_PER_CACHE_LINE);
> + memset(&instrs->buffer[instrs->count], 0,
> + (pad - instrs->count) * sizeof(instrs->buffer[0]));
> + instrs->count = pad;
> +}
> +
> +static u32 calc_job_credits(u32 profile_mask)
> +{
> + struct panthor_job_ringbuf_instrs instrs = {
> + .count = 0,
> + };
> + struct panthor_job_cs_params params = {
> + .profile_mask = profile_mask,
> };
>
> - /* Need to be cacheline aligned to please the prefetcher. */
> - static_assert(sizeof(call_instrs) % 64 == 0,
> - "call_instrs is not aligned on a cacheline");
> + prepare_job_instrs(¶ms, &instrs);
> + return instrs.count;
> +}
> +
> +static struct dma_fence *
> +queue_run_job(struct drm_sched_job *sched_job)
> +{
> + struct panthor_job *job = container_of(sched_job, struct panthor_job, base);
> + struct panthor_group *group = job->group;
> + struct panthor_queue *queue = group->queues[job->queue_idx];
> + struct panthor_device *ptdev = group->ptdev;
> + struct panthor_scheduler *sched = ptdev->scheduler;
> + struct panthor_job_ringbuf_instrs instrs;
instrs isn't initialised...
> + struct panthor_job_cs_params cs_params;
> + struct dma_fence *done_fence;
> + int ret;
>
> /* Stream size is zero, nothing to do except making sure all previously
> * submitted jobs are done before we signal the
> @@ -2900,17 +3062,23 @@ queue_run_job(struct drm_sched_job *sched_job)
> queue->fence_ctx.id,
> atomic64_inc_return(&queue->fence_ctx.seqno));
>
> - memcpy(queue->ringbuf->kmap + ringbuf_insert,
> - call_instrs, sizeof(call_instrs));
> + job->profiling.slot = queue->profiling.seqno++;
> + if (queue->profiling.seqno == queue->profiling.slot_count)
> + queue->profiling.seqno = 0;
> +
> + job->ringbuf.start = queue->iface.input->insert;
> +
> + get_job_cs_params(job, &cs_params);
> + prepare_job_instrs(&cs_params, &instrs);
...but it's passed into prepare_job_instrs() which depends on
instrs.count (same bug as was in calc_job_credits()) - sorry I didn't
spot it last review.
Initializing instrs makes everything work for me.
I'm not sure quite what kernel configuration you are using but I wonder
if you've got a 'hardening' option enabled which is causing the stack to
be zero-initialised. It's worth turning it off for testing purposes ;)
Steve
> + copy_instrs_to_ringbuf(queue, job, &instrs);
> +
> + job->ringbuf.end = job->ringbuf.start + (instrs.count * sizeof(u64));
>
> panthor_job_get(&job->base);
> spin_lock(&queue->fence_ctx.lock);
> list_add_tail(&job->node, &queue->fence_ctx.in_flight_jobs);
> spin_unlock(&queue->fence_ctx.lock);
>
> - job->ringbuf.start = queue->iface.input->insert;
> - job->ringbuf.end = job->ringbuf.start + sizeof(call_instrs);
> -
> /* Make sure the ring buffer is updated before the INSERT
> * register.
> */
> @@ -3003,6 +3171,34 @@ static const struct drm_sched_backend_ops panthor_queue_sched_ops = {
> .free_job = queue_free_job,
> };
>
> +static u32 calc_profiling_ringbuf_num_slots(struct panthor_device *ptdev,
> + u32 cs_ringbuf_size)
> +{
> + u32 min_profiled_job_instrs = U32_MAX;
> + u32 last_flag = fls(PANTHOR_DEVICE_PROFILING_ALL);
> +
> + /*
> + * We want to calculate the minimum size of a profiled job's CS,
> + * because since they need additional instructions for the sampling
> + * of performance metrics, they might take up further slots in
> + * the queue's ringbuffer. This means we might not need as many job
> + * slots for keeping track of their profiling information. What we
> + * need is the maximum number of slots we should allocate to this end,
> + * which matches the maximum number of profiled jobs we can place
> + * simultaneously in the queue's ring buffer.
> + * That has to be calculated separately for every single job profiling
> + * flag, but not in the case job profiling is disabled, since unprofiled
> + * jobs don't need to keep track of this at all.
> + */
> + for (u32 i = 0; i < last_flag; i++) {
> + if (BIT(i) & PANTHOR_DEVICE_PROFILING_ALL)
> + min_profiled_job_instrs =
> + min(min_profiled_job_instrs, calc_job_credits(BIT(i)));
> + }
> +
> + return DIV_ROUND_UP(cs_ringbuf_size, min_profiled_job_instrs * sizeof(u64));
> +}
> +
> static struct panthor_queue *
> group_create_queue(struct panthor_group *group,
> const struct drm_panthor_queue_create *args)
> @@ -3056,9 +3252,35 @@ group_create_queue(struct panthor_group *group,
> goto err_free_queue;
> }
>
> + queue->profiling.slot_count =
> + calc_profiling_ringbuf_num_slots(group->ptdev, args->ringbuf_size);
> +
> + queue->profiling.slots =
> + panthor_kernel_bo_create(group->ptdev, group->vm,
> + queue->profiling.slot_count *
> + sizeof(struct panthor_job_profiling_data),
> + DRM_PANTHOR_BO_NO_MMAP,
> + DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC |
> + DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED,
> + PANTHOR_VM_KERNEL_AUTO_VA);
> +
> + if (IS_ERR(queue->profiling.slots)) {
> + ret = PTR_ERR(queue->profiling.slots);
> + goto err_free_queue;
> + }
> +
> + ret = panthor_kernel_bo_vmap(queue->profiling.slots);
> + if (ret)
> + goto err_free_queue;
> +
> + /*
> + * Credit limit argument tells us the total number of instructions
> + * across all CS slots in the ringbuffer, with some jobs requiring
> + * twice as many as others, depending on their profiling status.
> + */
> ret = drm_sched_init(&queue->scheduler, &panthor_queue_sched_ops,
> group->ptdev->scheduler->wq, 1,
> - args->ringbuf_size / (NUM_INSTRS_PER_SLOT * sizeof(u64)),
> + args->ringbuf_size / sizeof(u64),
> 0, msecs_to_jiffies(JOB_TIMEOUT_MS),
> group->ptdev->reset.wq,
> NULL, "panthor-queue", group->ptdev->base.dev);
> @@ -3354,6 +3576,7 @@ panthor_job_create(struct panthor_file *pfile,
> {
> struct panthor_group_pool *gpool = pfile->groups;
> struct panthor_job *job;
> + u32 credits;
> int ret;
>
> if (qsubmit->pad)
> @@ -3407,9 +3630,16 @@ panthor_job_create(struct panthor_file *pfile,
> }
> }
>
> + job->profiling.mask = pfile->ptdev->profile_mask;
> + credits = calc_job_credits(job->profiling.mask);
> + if (credits == 0) {
> + ret = -EINVAL;
> + goto err_put_job;
> + }
> +
> ret = drm_sched_job_init(&job->base,
> &job->group->queues[job->queue_idx]->entity,
> - 1, job->group);
> + credits, job->group);
> if (ret)
> goto err_put_job;
>
More information about the dri-devel
mailing list