[PATCH v7 04/10] accel/rocket: Add job submission IOCTL

Fri Jul 11 16:00:54 UTC 2025

On Tue, Jun 24, 2025 at 3:50 PM Robin Murphy <robin.murphy at arm.com> wrote:
>
> On 2025-06-06 7:28 am, Tomeu Vizoso wrote:
> [...]
> > diff --git a/drivers/accel/rocket/rocket_device.h b/drivers/accel/rocket/rocket_device.h
> > index 10acfe8534f00a7985d40a93f4b2f7f69d43caee..50e46f0516bd1615b5f826c5002a6c0ecbf9aed4 100644
> > --- a/drivers/accel/rocket/rocket_device.h
> > +++ b/drivers/accel/rocket/rocket_device.h
> > @@ -13,6 +13,8 @@
> >   struct rocket_device {
> >       struct drm_device ddev;
> >
> > +     struct mutex sched_lock;
> > +
> >       struct mutex iommu_lock;
>
> Just realised I missed this in the last patch, but iommu_lock appears to
> be completely unnecessary now.
>
> >       struct rocket_core *cores;
> [...]
> > +static void rocket_job_hw_submit(struct rocket_core *core, struct rocket_job *job)
> > +{
> > +     struct rocket_task *task;
> > +     bool task_pp_en = 1;
> > +     bool task_count = 1;
> > +
> > +     /* GO ! */
> > +
> > +     /* Don't queue the job if a reset is in progress */
> > +     if (atomic_read(&core->reset.pending))
> > +             return;
> > +
> > +     task = &job->tasks[job->next_task_idx];
> > +     job->next_task_idx++;
> > +
> > +     rocket_pc_writel(core, BASE_ADDRESS, 0x1);
> > +
> > +     rocket_cna_writel(core, S_POINTER, 0xe + 0x10000000 * core->index);
> > +     rocket_core_writel(core, S_POINTER, 0xe + 0x10000000 * core->index);
>
> Those really look like bitfield operations rather than actual arithmetic
> to me.
>
> > +
> > +     rocket_pc_writel(core, BASE_ADDRESS, task->regcmd);
>
> I don't see how regcmd is created (I guess that's in userspace?), but
> given that it's explicitly u64 all the way through - and especially
> since you claim to support 40-bit DMA addresses - it definitely seems
> suspicious that the upper 32 bits never seem to be consumed anywhere :/

Yeah, but there's no other register for BASE_ADDRESS address in the TRM.

> > +     rocket_pc_writel(core, REGISTER_AMOUNTS, (task->regcmd_count + 1) / 2 - 1);
> > +
> > +     rocket_pc_writel(core, INTERRUPT_MASK, PC_INTERRUPT_MASK_DPU_0 | PC_INTERRUPT_MASK_DPU_1);
> > +     rocket_pc_writel(core, INTERRUPT_CLEAR, PC_INTERRUPT_CLEAR_DPU_0 | PC_INTERRUPT_CLEAR_DPU_1);
> > +
> > +     rocket_pc_writel(core, TASK_CON, ((0x6 | task_pp_en) << 12) | task_count);
> > +
> > +     rocket_pc_writel(core, TASK_DMA_BASE_ADDR, 0x0);
> > +
> > +     rocket_pc_writel(core, OPERATION_ENABLE, 0x1);
> > +
> > +     dev_dbg(core->dev, "Submitted regcmd at 0x%llx to core %d", task->regcmd, core->index);
> > +}
> [...]
> > +static struct dma_fence *rocket_job_run(struct drm_sched_job *sched_job)
> > +{
> > +     struct rocket_job *job = to_rocket_job(sched_job);
> > +     struct rocket_device *rdev = job->rdev;
> > +     struct rocket_core *core = sched_to_core(rdev, sched_job->sched);
> > +     struct dma_fence *fence = NULL;
> > +     int ret;
> > +
> > +     if (unlikely(job->base.s_fence->finished.error))
> > +             return NULL;
> > +
> > +     /*
> > +      * Nothing to execute: can happen if the job has finished while
> > +      * we were resetting the GPU.
>
> GPU? (Similarly in various other comments/prints)
>
> > +      */
> > +     if (job->next_task_idx == job->task_count)
> > +             return NULL;
> > +
> > +     fence = rocket_fence_create(core);
> > +     if (IS_ERR(fence))
> > +             return fence;
> > +
> > +     if (job->done_fence)
> > +             dma_fence_put(job->done_fence);
> > +     job->done_fence = dma_fence_get(fence);
> > +
> > +     ret = pm_runtime_get_sync(core->dev);
> > +     if (ret < 0)
> > +             return fence;
> > +
> > +     ret = iommu_attach_group(job->domain, iommu_group_get(core->dev));
>
> I don't see iommu_group_put() anywhere, so you're leaking refcounts all
> over.
>
> > +     if (ret < 0)
> > +             return fence;
> > +
> > +     scoped_guard(spinlock, &core->job_lock) {
> > +             core->in_flight_job = job;
> > +             rocket_job_hw_submit(core, job);
> > +     }
> > +
> > +     return fence;
> > +}
> [...]
> > +static void rocket_job_handle_irq(struct rocket_core *core)
> > +{
> > +     u32 status, raw_status;
> > +
> > +     pm_runtime_mark_last_busy(core->dev);
> > +
> > +     status = rocket_pc_readl(core, INTERRUPT_STATUS);
> > +     raw_status = rocket_pc_readl(core, INTERRUPT_RAW_STATUS);
> > +
> > +     rocket_pc_writel(core, OPERATION_ENABLE, 0x0);
> > +     rocket_pc_writel(core, INTERRUPT_CLEAR, 0x1ffff);
>
> What was the point of reading the status registers if we're just going
> to blindly clear every possible condition anyway?
>
> > +     scoped_guard(spinlock, &core->job_lock)
> > +             if (core->in_flight_job)
> > +                     rocket_job_handle_done(core, core->in_flight_job);
>
> But then is it really OK to just start the next task regardless of
> whether the current task was reporting successful completion or an error?
>
> > +}
> > +
> > +static void
> > +rocket_reset(struct rocket_core *core, struct drm_sched_job *bad)
> > +{
> > +     bool cookie;
> > +
> > +     if (!atomic_read(&core->reset.pending))
> > +             return;
> > +
> > +     /*
> > +      * Stop the scheduler.
> > +      *
> > +      * FIXME: We temporarily get out of the dma_fence_signalling section
> > +      * because the cleanup path generate lockdep splats when taking locks
> > +      * to release job resources. We should rework the code to follow this
> > +      * pattern:
> > +      *
> > +      *      try_lock
> > +      *      if (locked)
> > +      *              release
> > +      *      else
> > +      *              schedule_work_to_release_later
> > +      */
> > +     drm_sched_stop(&core->sched, bad);
> > +
> > +     cookie = dma_fence_begin_signalling();
> > +
> > +     if (bad)
> > +             drm_sched_increase_karma(bad);
> > +
> > +     /*
> > +      * Mask job interrupts and synchronize to make sure we won't be
> > +      * interrupted during our reset.
> > +      */
> > +     rocket_pc_writel(core, INTERRUPT_MASK, 0x0);
> > +     synchronize_irq(core->irq);
>
> ...except it's a shared IRQ, so it can still merrily fire at any time.
>
> > +
> > +     /* Handle the remaining interrupts before we reset. */
> > +     rocket_job_handle_irq(core);
> > +
> > +     /*
> > +      * Remaining interrupts have been handled, but we might still have
> > +      * stuck jobs. Let's make sure the PM counters stay balanced by
> > +      * manually calling pm_runtime_put_noidle() and
> > +      * rocket_devfreq_record_idle() for each stuck job.
> > +      * Let's also make sure the cycle counting register's refcnt is
> > +      * kept balanced to prevent it from running forever
>
> Comments that don't match the code are more confusing than helpful :/
>
> > +      */
> > +     scoped_guard(spinlock, &core->job_lock) {
> > +             if (core->in_flight_job)
> > +                     pm_runtime_put_noidle(core->dev);
> > +
> > +             core->in_flight_job = NULL;
> > +     }
> > +
> > +     /* Proceed with reset now. */
> > +     pm_runtime_force_suspend(core->dev);
> > +     pm_runtime_force_resume(core->dev);
>
> Can you guarantee that actually resets the hardware if something else is
> holding the power domain open or RPM is disabled? I'm not familiar with
> the details of drm_sched, but if there are other jobs queued behind the
> stuck one would it even pass the rocket_job_is_idle() check for suspend
> to succeed anyway?
>
> Not to mention that you have an actual reset control in the DT binding,
> which isn't even optional... :/

Yeah, I think in my testing the resets didn't work because we didn't
have the right resources in the DT nodes. Have retested and the
documented reset process is enough to reset each core.

Will be sending soon a revised version.

Thanks,

Tomeu

> > +     /* GPU has been reset, we can clear the reset pending bit. */
> > +     atomic_set(&core->reset.pending, 0);
> > +
> > +     /*
> > +      * Now resubmit jobs that were previously queued but didn't have a
> > +      * chance to finish.
> > +      * FIXME: We temporarily get out of the DMA fence signalling section
> > +      * while resubmitting jobs because the job submission logic will
> > +      * allocate memory with the GFP_KERNEL flag which can trigger memory
> > +      * reclaim and exposes a lock ordering issue.
> > +      */
> > +     dma_fence_end_signalling(cookie);
> > +     drm_sched_resubmit_jobs(&core->sched);
>
> Since I happened to look, this says it's deprecated?
>
> > +     cookie = dma_fence_begin_signalling();
> > +
> > +     /* Restart the scheduler */
> > +     drm_sched_start(&core->sched, 0);
> > +
> > +     dma_fence_end_signalling(cookie);
> > +}
> > +
> > +static enum drm_gpu_sched_stat rocket_job_timedout(struct drm_sched_job *sched_job)
> > +{
> > +     struct rocket_job *job = to_rocket_job(sched_job);
> > +     struct rocket_device *rdev = job->rdev;
> > +     struct rocket_core *core = sched_to_core(rdev, sched_job->sched);
> > +
> > +     /*
> > +      * If the GPU managed to complete this jobs fence, the timeout is
> > +      * spurious. Bail out.
> > +      */
> > +     if (dma_fence_is_signaled(job->done_fence))
> > +             return DRM_GPU_SCHED_STAT_NOMINAL;
>
> Do we really need the same return condition twice? What if the IRQ fires
> immediately after we've made this check, and is handled without delay
> such that sychronize_irq() effectively still does nothing? Either way
> we've taken longer than the timeout value to observe the job completing
> successfully, and either that's significant and worth warning about or
> it's not - I don't see any point in trying to (inaccurately) nitpick
> *why* it might have happened.
>
> > +     /*
> > +      * Rocket IRQ handler may take a long time to process an interrupt
> > +      * if there is another IRQ handler hogging the processing.
> > +      * For example, the HDMI encoder driver might be stuck in the IRQ
> > +      * handler for a significant time in a case of bad cable connection.
>
> What have HDMI cables got to do with anything here? Yes, in general IRQ
> latency can be high, since CPUs can have IRQs masked and/or be taking
> higher-priority interrupts for any number of reasons. I don't see how an
> oddly-specific example (of apparently poor driver design, to boot) is
> useful.
>
> > +      * In order to catch such cases and not report spurious rocket
> > +      * job timeouts, synchronize the IRQ handler and re-check the fence
> > +      * status.
> > +      */
> > +     synchronize_irq(core->irq);
> > +
> > +     if (dma_fence_is_signaled(job->done_fence)) {
> > +             dev_warn(core->dev, "unexpectedly high interrupt latency\n");
> > +             return DRM_GPU_SCHED_STAT_NOMINAL;
> > +     }
> > +
> > +     dev_err(core->dev, "gpu sched timeout");
> > +
> > +     atomic_set(&core->reset.pending, 1);
> > +     rocket_reset(core, sched_job);
> > +     iommu_detach_group(NULL, iommu_group_get(core->dev));
> > +
> > +     return DRM_GPU_SCHED_STAT_NOMINAL;
> > +}
> > +
> > +static void rocket_reset_work(struct work_struct *work)
> > +{
> > +     struct rocket_core *core;
> > +
> > +     core = container_of(work, struct rocket_core, reset.work);
> > +     rocket_reset(core, NULL);
> > +}
> > +
> > +static const struct drm_sched_backend_ops rocket_sched_ops = {
> > +     .run_job = rocket_job_run,
> > +     .timedout_job = rocket_job_timedout,
> > +     .free_job = rocket_job_free
> > +};
> > +
> > +static irqreturn_t rocket_job_irq_handler_thread(int irq, void *data)
> > +{
> > +     struct rocket_core *core = data;
> > +
> > +     rocket_job_handle_irq(core);
> > +
> > +     return IRQ_HANDLED;
> > +}
> > +
> > +static irqreturn_t rocket_job_irq_handler(int irq, void *data)
> > +{
> > +     struct rocket_core *core = data;
> > +     u32 raw_status = rocket_pc_readl(core, INTERRUPT_RAW_STATUS);
>
> Given that this can be a shared IRQ as above, it would be a good idea to
> take care to avoid register accesses while suspended. Especially if
> you're trying to utilise suspend to reset a failing job that may well be
> throwing IOMMU faults.
>
> > +
> > +     WARN_ON(raw_status & PC_INTERRUPT_RAW_STATUS_DMA_READ_ERROR);
> > +     WARN_ON(raw_status & PC_INTERRUPT_RAW_STATUS_DMA_READ_ERROR);
> > +
> > +     if (!(raw_status & PC_INTERRUPT_RAW_STATUS_DPU_0 ||
> > +           raw_status & PC_INTERRUPT_RAW_STATUS_DPU_1))
> > +             return IRQ_NONE;
> > +
> > +     rocket_pc_writel(core, INTERRUPT_MASK, 0x0);
> > +
> > +     return IRQ_WAKE_THREAD;
> > +}
> > +
> > +int rocket_job_init(struct rocket_core *core)
> > +{
> > +     struct drm_sched_init_args args = {
> > +             .ops = &rocket_sched_ops,
> > +             .num_rqs = DRM_SCHED_PRIORITY_COUNT,
> > +             .credit_limit = 1,
>
> Ah, does this mean that all the stuff about queued jobs was in fact all
> nonsense anyway?
>
> > +             .timeout = msecs_to_jiffies(JOB_TIMEOUT_MS),
> > +             .name = dev_name(core->dev),
> > +             .dev = core->dev,
> > +     };
> > +     int ret;
> > +
> > +     INIT_WORK(&core->reset.work, rocket_reset_work);
> > +     spin_lock_init(&core->job_lock);
> > +
> > +     core->irq = platform_get_irq(to_platform_device(core->dev), 0);
> > +     if (core->irq < 0)
> > +             return core->irq;
> > +
> > +     ret = devm_request_threaded_irq(core->dev, core->irq,
> > +                                     rocket_job_irq_handler,
> > +                                     rocket_job_irq_handler_thread,
> > +                                     IRQF_SHARED, KBUILD_MODNAME "-job",
>
> Is it really a "job" interrupt though? The binding and the register
> definitions suggest it's just a general status interrupt for the core.
> Furthermore since we expect to have multiple cores, being able to more
> easily identify and attribute per-core IRQ activity seems more useful
> for debugging than copy-pasting from something really rather different
> which also expects to be the only one of its kind on the system.
>
> Thanks,
> Robin.
>
> > +                                     core);
> > +     if (ret) {
> > +             dev_err(core->dev, "failed to request job irq");
> > +             return ret;
> > +     }