[PATCH 33/36] drm/amdgpu/vcn: add a helper framework for engine resets

Tue Jun 17 13:09:08 UTC 2025

On Tue, Jun 17, 2025 at 2:10 AM Sundararaju, Sathishkumar
<sasundar at amd.com> wrote:
>
> Please ignore my previous comments here, the new helper additions for
> vcn non unified queues are good.
>
> But one concern, the vinst->reset(vinst) callback must take in ring
> pointer to handle guilty/non-guilty
> for appropriate re-emit part, else the guilty ring has to be tracked
> within the ring structure or identified
> by some query with in reset.

I wasn't sure if we could handle the reemit properly on these VCN
chips.  So at least for the first iteration, I just killed all the
queues.  Is there a way to know which ring caused the hang?  How does
the VCN firmware handle the rings?

Alex

>
> Regards,
> Sathish
>
>
> On 6/17/2025 10:00 AM, Sundararaju, Sathishkumar wrote:
> > Hi Alex,
> >
> > Would it be good to have this logic in the reset call back itself ?
> >
> > Adding common vinst->reset stops the flexibility of having separate
> > reset functionality for enc rings and decode rings,
> > can selectively handle the drm_sched_wqueue_start/stop and re-emit of
> > guilty/non-guilty for enc and dec separately.
> >
> > And the usual vcn_stop() followed by vcn_start() isn't helping in
> > reset of the engine for vcn3.
> >
> > I tried a workaround to pause_dpg and enable static clockgate and
> > powergate, and then stop()/start() the engine
> > which is working consistently so far.
> >
> > Regards,
> > Sathish
> >
> > On 6/17/2025 8:38 AM, Alex Deucher wrote:
> >> With engine resets we reset all queues on the engine rather
> >> than just a single queue.  Add a framework to handle this
> >> similar to SDMA.
> >>
> >> Signed-off-by: Alex Deucher <alexander.deucher at amd.com>
> >> ---
> >>   drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 64 +++++++++++++++++++++++++
> >>   drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  6 ++-
> >>   2 files changed, 69 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> >> index c8885c3d54b33..075740ed275eb 100644
> >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> >> @@ -134,6 +134,7 @@ int amdgpu_vcn_sw_init(struct amdgpu_device
> >> *adev, int i)
> >> mutex_init(&adev->vcn.inst[i].vcn1_jpeg1_workaround);
> >>       mutex_init(&adev->vcn.inst[i].vcn_pg_lock);
> >> +    mutex_init(&adev->vcn.inst[i].engine_reset_mutex);
> >>       atomic_set(&adev->vcn.inst[i].total_submission_cnt, 0);
> >>       INIT_DELAYED_WORK(&adev->vcn.inst[i].idle_work,
> >> amdgpu_vcn_idle_work_handler);
> >> atomic_set(&adev->vcn.inst[i].dpg_enc_submission_cnt, 0);
> >> @@ -1451,3 +1452,66 @@ int vcn_set_powergating_state(struct
> >> amdgpu_ip_block *ip_block,
> >>         return ret;
> >>   }
> >> +
> >> +/**
> >> + * amdgpu_vcn_reset_engine - Reset a specific VCN engine
> >> + * @adev: Pointer to the AMDGPU device
> >> + * @instance_id: VCN engine instance to reset
> >> + *
> >> + * Returns: 0 on success, or a negative error code on failure.
> >> + */
> >> +static int amdgpu_vcn_reset_engine(struct amdgpu_device *adev,
> >> +                   uint32_t instance_id)
> >> +{
> >> +    struct amdgpu_vcn_inst *vinst = &adev->vcn.inst[instance_id];
> >> +    int r, i;
> >> +
> >> +    mutex_lock(&vinst->engine_reset_mutex);
> >> +    /* Stop the scheduler's work queue for the dec and enc rings if
> >> they are running.
> >> +     * This ensures that no new tasks are submitted to the queues while
> >> +     * the reset is in progress.
> >> +     */
> >> +    drm_sched_wqueue_stop(&vinst->ring_dec.sched);
> >> +    for (i = 0; i < vinst->num_enc_rings; i++)
> >> + drm_sched_wqueue_stop(&vinst->ring_enc[i].sched);
> >> +
> >> +    /* Perform the VCN reset for the specified instance */
> >> +    r = vinst->reset(vinst);
> >> +    if (r) {
> >> +        dev_err(adev->dev, "Failed to reset VCN instance %u\n",
> >> instance_id);
> >> +    } else {
> >> +        /* Restart the scheduler's work queue for the dec and enc rings
> >> +         * if they were stopped by this function. This allows new tasks
> >> +         * to be submitted to the queues after the reset is complete.
> >> +         */
> >> +        drm_sched_wqueue_start(&vinst->ring_dec.sched);
> >> +        for (i = 0; i < vinst->num_enc_rings; i++)
> >> + drm_sched_wqueue_start(&vinst->ring_enc[i].sched);
> >> +    }
> >> +    mutex_unlock(&vinst->engine_reset_mutex);
> >> +
> >> +    return r;
> >> +}
> >> +
> >> +/**
> >> + * amdgpu_vcn_ring_reset - Reset a VCN ring
> >> + * @ring: ring to reset
> >> + * @vmid: vmid of guilty job
> >> + * @guilty_fence: guilty fence
> >> + *
> >> + * This helper is for VCN blocks without unified queues because
> >> + * resetting the engine resets all queues in that case.  With
> >> + * unified queues we have one queue per engine.
> >> + * Returns: 0 on success, or a negative error code on failure.
> >> + */
> >> +int amdgpu_vcn_ring_reset(struct amdgpu_ring *ring,
> >> +              unsigned int vmid,
> >> +              struct amdgpu_fence *guilty_fence)
> >> +{
> >> +    struct amdgpu_device *adev = ring->adev;
> >> +
> >> +    if (adev->vcn.inst[ring->me].using_unified_queue)
> >> +        return -EINVAL;
> >> +
> >> +    return amdgpu_vcn_reset_engine(adev, ring->me);
> >> +}
> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> >> index 83adf81defc71..0bc0a94d7cf0f 100644
> >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> >> @@ -330,7 +330,9 @@ struct amdgpu_vcn_inst {
> >>                     struct dpg_pause_state *new_state);
> >>       int (*set_pg_state)(struct amdgpu_vcn_inst *vinst,
> >>                   enum amd_powergating_state state);
> >> +    int (*reset)(struct amdgpu_vcn_inst *vinst);
> >>       bool using_unified_queue;
> >> +    struct mutex        engine_reset_mutex;
> >>   };
> >>     struct amdgpu_vcn_ras {
> >> @@ -552,5 +554,7 @@ void amdgpu_debugfs_vcn_sched_mask_init(struct
> >> amdgpu_device *adev);
> >>     int vcn_set_powergating_state(struct amdgpu_ip_block *ip_block,
> >>                     enum amd_powergating_state state);
> >> -
> >> +int amdgpu_vcn_ring_reset(struct amdgpu_ring *ring,
> >> +              unsigned int vmid,
> >> +              struct amdgpu_fence *guilty_fence);
> >>   #endif
> >
>