[PATCH 33/36] drm/amdgpu/vcn: add a helper framework for engine resets
Sundararaju, Sathishkumar
sasundar at amd.com
Tue Jun 17 04:30:12 UTC 2025
Hi Alex,
Would it be good to have this logic in the reset call back itself ?
Adding common vinst->reset stops the flexibility of having separate
reset functionality for enc rings and decode rings,
can selectively handle the drm_sched_wqueue_start/stop and re-emit of
guilty/non-guilty for enc and dec separately.
And the usual vcn_stop() followed by vcn_start() isn't helping in reset
of the engine for vcn3.
I tried a workaround to pause_dpg and enable static clockgate and
powergate, and then stop()/start() the engine
which is working consistently so far.
Regards,
Sathish
On 6/17/2025 8:38 AM, Alex Deucher wrote:
> With engine resets we reset all queues on the engine rather
> than just a single queue. Add a framework to handle this
> similar to SDMA.
>
> Signed-off-by: Alex Deucher <alexander.deucher at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 64 +++++++++++++++++++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h | 6 ++-
> 2 files changed, 69 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> index c8885c3d54b33..075740ed275eb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> @@ -134,6 +134,7 @@ int amdgpu_vcn_sw_init(struct amdgpu_device *adev, int i)
>
> mutex_init(&adev->vcn.inst[i].vcn1_jpeg1_workaround);
> mutex_init(&adev->vcn.inst[i].vcn_pg_lock);
> + mutex_init(&adev->vcn.inst[i].engine_reset_mutex);
> atomic_set(&adev->vcn.inst[i].total_submission_cnt, 0);
> INIT_DELAYED_WORK(&adev->vcn.inst[i].idle_work, amdgpu_vcn_idle_work_handler);
> atomic_set(&adev->vcn.inst[i].dpg_enc_submission_cnt, 0);
> @@ -1451,3 +1452,66 @@ int vcn_set_powergating_state(struct amdgpu_ip_block *ip_block,
>
> return ret;
> }
> +
> +/**
> + * amdgpu_vcn_reset_engine - Reset a specific VCN engine
> + * @adev: Pointer to the AMDGPU device
> + * @instance_id: VCN engine instance to reset
> + *
> + * Returns: 0 on success, or a negative error code on failure.
> + */
> +static int amdgpu_vcn_reset_engine(struct amdgpu_device *adev,
> + uint32_t instance_id)
> +{
> + struct amdgpu_vcn_inst *vinst = &adev->vcn.inst[instance_id];
> + int r, i;
> +
> + mutex_lock(&vinst->engine_reset_mutex);
> + /* Stop the scheduler's work queue for the dec and enc rings if they are running.
> + * This ensures that no new tasks are submitted to the queues while
> + * the reset is in progress.
> + */
> + drm_sched_wqueue_stop(&vinst->ring_dec.sched);
> + for (i = 0; i < vinst->num_enc_rings; i++)
> + drm_sched_wqueue_stop(&vinst->ring_enc[i].sched);
> +
> + /* Perform the VCN reset for the specified instance */
> + r = vinst->reset(vinst);
> + if (r) {
> + dev_err(adev->dev, "Failed to reset VCN instance %u\n", instance_id);
> + } else {
> + /* Restart the scheduler's work queue for the dec and enc rings
> + * if they were stopped by this function. This allows new tasks
> + * to be submitted to the queues after the reset is complete.
> + */
> + drm_sched_wqueue_start(&vinst->ring_dec.sched);
> + for (i = 0; i < vinst->num_enc_rings; i++)
> + drm_sched_wqueue_start(&vinst->ring_enc[i].sched);
> + }
> + mutex_unlock(&vinst->engine_reset_mutex);
> +
> + return r;
> +}
> +
> +/**
> + * amdgpu_vcn_ring_reset - Reset a VCN ring
> + * @ring: ring to reset
> + * @vmid: vmid of guilty job
> + * @guilty_fence: guilty fence
> + *
> + * This helper is for VCN blocks without unified queues because
> + * resetting the engine resets all queues in that case. With
> + * unified queues we have one queue per engine.
> + * Returns: 0 on success, or a negative error code on failure.
> + */
> +int amdgpu_vcn_ring_reset(struct amdgpu_ring *ring,
> + unsigned int vmid,
> + struct amdgpu_fence *guilty_fence)
> +{
> + struct amdgpu_device *adev = ring->adev;
> +
> + if (adev->vcn.inst[ring->me].using_unified_queue)
> + return -EINVAL;
> +
> + return amdgpu_vcn_reset_engine(adev, ring->me);
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> index 83adf81defc71..0bc0a94d7cf0f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> @@ -330,7 +330,9 @@ struct amdgpu_vcn_inst {
> struct dpg_pause_state *new_state);
> int (*set_pg_state)(struct amdgpu_vcn_inst *vinst,
> enum amd_powergating_state state);
> + int (*reset)(struct amdgpu_vcn_inst *vinst);
> bool using_unified_queue;
> + struct mutex engine_reset_mutex;
> };
>
> struct amdgpu_vcn_ras {
> @@ -552,5 +554,7 @@ void amdgpu_debugfs_vcn_sched_mask_init(struct amdgpu_device *adev);
>
> int vcn_set_powergating_state(struct amdgpu_ip_block *ip_block,
> enum amd_powergating_state state);
> -
> +int amdgpu_vcn_ring_reset(struct amdgpu_ring *ring,
> + unsigned int vmid,
> + struct amdgpu_fence *guilty_fence);
> #endif
More information about the amd-gfx
mailing list