[PATCH 33/36] drm/amdgpu/vcn: add a helper framework for engine resets

Tue Jun 17 04:30:12 UTC 2025

Hi Alex,

Would it be good to have this logic in the reset call back itself ?

Adding common vinst->reset stops the flexibility of having separate 
reset functionality for enc rings and decode rings,
can selectively handle the drm_sched_wqueue_start/stop and re-emit of 
guilty/non-guilty for enc and dec separately.

And the usual vcn_stop() followed by vcn_start() isn't helping in reset 
of the engine for vcn3.

I tried a workaround to pause_dpg and enable static clockgate and 
powergate, and then stop()/start() the engine
which is working consistently so far.

Regards,
Sathish

On 6/17/2025 8:38 AM, Alex Deucher wrote:
> With engine resets we reset all queues on the engine rather
> than just a single queue.  Add a framework to handle this
> similar to SDMA.
>
> Signed-off-by: Alex Deucher <alexander.deucher at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 64 +++++++++++++++++++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  6 ++-
>   2 files changed, 69 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> index c8885c3d54b33..075740ed275eb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> @@ -134,6 +134,7 @@ int amdgpu_vcn_sw_init(struct amdgpu_device *adev, int i)
>   
>   	mutex_init(&adev->vcn.inst[i].vcn1_jpeg1_workaround);
>   	mutex_init(&adev->vcn.inst[i].vcn_pg_lock);
> +	mutex_init(&adev->vcn.inst[i].engine_reset_mutex);
>   	atomic_set(&adev->vcn.inst[i].total_submission_cnt, 0);
>   	INIT_DELAYED_WORK(&adev->vcn.inst[i].idle_work, amdgpu_vcn_idle_work_handler);
>   	atomic_set(&adev->vcn.inst[i].dpg_enc_submission_cnt, 0);
> @@ -1451,3 +1452,66 @@ int vcn_set_powergating_state(struct amdgpu_ip_block *ip_block,
>   
>   	return ret;
>   }
> +
> +/**
> + * amdgpu_vcn_reset_engine - Reset a specific VCN engine
> + * @adev: Pointer to the AMDGPU device
> + * @instance_id: VCN engine instance to reset
> + *
> + * Returns: 0 on success, or a negative error code on failure.
> + */
> +static int amdgpu_vcn_reset_engine(struct amdgpu_device *adev,
> +				   uint32_t instance_id)
> +{
> +	struct amdgpu_vcn_inst *vinst = &adev->vcn.inst[instance_id];
> +	int r, i;
> +
> +	mutex_lock(&vinst->engine_reset_mutex);
> +	/* Stop the scheduler's work queue for the dec and enc rings if they are running.
> +	 * This ensures that no new tasks are submitted to the queues while
> +	 * the reset is in progress.
> +	 */
> +	drm_sched_wqueue_stop(&vinst->ring_dec.sched);
> +	for (i = 0; i < vinst->num_enc_rings; i++)
> +		drm_sched_wqueue_stop(&vinst->ring_enc[i].sched);
> +
> +	/* Perform the VCN reset for the specified instance */
> +	r = vinst->reset(vinst);
> +	if (r) {
> +		dev_err(adev->dev, "Failed to reset VCN instance %u\n", instance_id);
> +	} else {
> +		/* Restart the scheduler's work queue for the dec and enc rings
> +		 * if they were stopped by this function. This allows new tasks
> +		 * to be submitted to the queues after the reset is complete.
> +		 */
> +		drm_sched_wqueue_start(&vinst->ring_dec.sched);
> +		for (i = 0; i < vinst->num_enc_rings; i++)
> +			drm_sched_wqueue_start(&vinst->ring_enc[i].sched);
> +	}
> +	mutex_unlock(&vinst->engine_reset_mutex);
> +
> +	return r;
> +}
> +
> +/**
> + * amdgpu_vcn_ring_reset - Reset a VCN ring
> + * @ring: ring to reset
> + * @vmid: vmid of guilty job
> + * @guilty_fence: guilty fence
> + *
> + * This helper is for VCN blocks without unified queues because
> + * resetting the engine resets all queues in that case.  With
> + * unified queues we have one queue per engine.
> + * Returns: 0 on success, or a negative error code on failure.
> + */
> +int amdgpu_vcn_ring_reset(struct amdgpu_ring *ring,
> +			  unsigned int vmid,
> +			  struct amdgpu_fence *guilty_fence)
> +{
> +	struct amdgpu_device *adev = ring->adev;
> +
> +	if (adev->vcn.inst[ring->me].using_unified_queue)
> +		return -EINVAL;
> +
> +	return amdgpu_vcn_reset_engine(adev, ring->me);
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> index 83adf81defc71..0bc0a94d7cf0f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> @@ -330,7 +330,9 @@ struct amdgpu_vcn_inst {
>   			      struct dpg_pause_state *new_state);
>   	int (*set_pg_state)(struct amdgpu_vcn_inst *vinst,
>   			    enum amd_powergating_state state);
> +	int (*reset)(struct amdgpu_vcn_inst *vinst);
>   	bool using_unified_queue;
> +	struct mutex		engine_reset_mutex;
>   };
>   
>   struct amdgpu_vcn_ras {
> @@ -552,5 +554,7 @@ void amdgpu_debugfs_vcn_sched_mask_init(struct amdgpu_device *adev);
>   
>   int vcn_set_powergating_state(struct amdgpu_ip_block *ip_block,
>   			      enum amd_powergating_state state);
> -
> +int amdgpu_vcn_ring_reset(struct amdgpu_ring *ring,
> +			  unsigned int vmid,
> +			  struct amdgpu_fence *guilty_fence);
>   #endif