[PATCH] drm/amdgpu/vcn: fix video profile race condition (v3)

Wu, David davidwu2 at amd.com
Wed Aug 13 16:31:34 UTC 2025


Hi Alex,

The addition of total_submission_cnt should work, in that a context
switch is unlikely to occur right after begin_use().
As for the suggestion of moving the increment inside the lock (which I
prefer, in case someone later adds code before the lock that is not
reviewed thoroughly) — that is up to you to decide.

Reviewed-by: David (Ming Qiang) Wu <David.Wu3 at amd.com>

Thanks,
David
On 8/13/2025 9:45 AM, Alex Deucher wrote:
> If there are multiple instances of the VCN running,
> we may end up switching the video profile while another
> instance is active because we only take into account
> the current instance's submissions.  Look at all
> outstanding fences for the video profile.
>
> v2: drop early exit in begin_use()
> v3: handle possible race between begin_use() work handler
>
> Fixes: 3b669df92c85 ("drm/amdgpu/vcn: adjust workload profile handling")
> Reviewed-by: Sathishkumar S <sathishkumar.sundararaju at amd.com> (v1)
> Signed-off-by: Alex Deucher <alexander.deucher at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 40 ++++++++++++-------------
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  1 +
>   2 files changed, 21 insertions(+), 20 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> index 9a76e11d1c184..593c1ddf8819b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> @@ -415,19 +415,25 @@ static void amdgpu_vcn_idle_work_handler(struct work_struct *work)
>   	struct amdgpu_vcn_inst *vcn_inst =
>   		container_of(work, struct amdgpu_vcn_inst, idle_work.work);
>   	struct amdgpu_device *adev = vcn_inst->adev;
> -	unsigned int fences = 0, fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
> -	unsigned int i = vcn_inst->inst, j;
> +	unsigned int total_fences = 0, fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
> +	unsigned int i, j;
>   	int r = 0;
>   
> -	if (adev->vcn.harvest_config & (1 << i))
> +	if (adev->vcn.harvest_config & (1 << vcn_inst->inst))
>   		return;
>   
> -	for (j = 0; j < adev->vcn.inst[i].num_enc_rings; ++j)
> -		fence[i] += amdgpu_fence_count_emitted(&vcn_inst->ring_enc[j]);
> +	for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
> +		struct amdgpu_vcn_inst *v = &adev->vcn.inst[i];
> +
> +		for (j = 0; j < v->num_enc_rings; ++j)
> +			fence[i] += amdgpu_fence_count_emitted(&v->ring_enc[j]);
> +		fence[i] += amdgpu_fence_count_emitted(&v->ring_dec);
> +		total_fences += fence[i];
> +	}
>   
>   	/* Only set DPG pause for VCN3 or below, VCN4 and above will be handled by FW */
>   	if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG &&
> -	    !adev->vcn.inst[i].using_unified_queue) {
> +	    !vcn_inst->using_unified_queue) {
>   		struct dpg_pause_state new_state;
>   
>   		if (fence[i] ||
> @@ -436,18 +442,18 @@ static void amdgpu_vcn_idle_work_handler(struct work_struct *work)
>   		else
>   			new_state.fw_based = VCN_DPG_STATE__UNPAUSE;
>   
> -		adev->vcn.inst[i].pause_dpg_mode(vcn_inst, &new_state);
> +		vcn_inst->pause_dpg_mode(vcn_inst, &new_state);
>   	}
>   
> -	fence[i] += amdgpu_fence_count_emitted(&vcn_inst->ring_dec);
> -	fences += fence[i];
> -
> -	if (!fences && !atomic_read(&vcn_inst->total_submission_cnt)) {
> +	if (!fence[vcn_inst->inst] && !atomic_read(&vcn_inst->total_submission_cnt)) {
> +		/* This is specific to this instance */
>   		mutex_lock(&vcn_inst->vcn_pg_lock);
>   		vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_GATE);
>   		mutex_unlock(&vcn_inst->vcn_pg_lock);
>   		mutex_lock(&adev->vcn.workload_profile_mutex);
> -		if (adev->vcn.workload_profile_active) {
> +		/* This is global and depends on all VCN instances */
> +		if (adev->vcn.workload_profile_active && !total_fences &&
> +		    !atomic_read(&adev->vcn.total_submission_cnt)) {
>   			r = amdgpu_dpm_switch_power_profile(adev, PP_SMC_POWER_PROFILE_VIDEO,
>   							    false);
>   			if (r)
> @@ -467,16 +473,10 @@ void amdgpu_vcn_ring_begin_use(struct amdgpu_ring *ring)
>   	int r = 0;
>   
>   	atomic_inc(&vcn_inst->total_submission_cnt);
> +	atomic_inc(&adev->vcn.total_submission_cnt);
Suggestion: move this atomic_inc() down, inside the mutex lock taken below.
>   	cancel_delayed_work_sync(&vcn_inst->idle_work);
>   
> -	/* We can safely return early here because we've cancelled the
> -	 * the delayed work so there is no one else to set it to false
> -	 * and we don't care if someone else sets it to true.
> -	 */
> -	if (adev->vcn.workload_profile_active)
> -		goto pg_lock;
> -
>   	mutex_lock(&adev->vcn.workload_profile_mutex);
i.e., move it to here, immediately after taking the mutex:
                atomic_inc(&adev->vcn.total_submission_cnt);
I think this would then work correctly for multiple instances.

David
>   	if (!adev->vcn.workload_profile_active) {
>   		r = amdgpu_dpm_switch_power_profile(adev, PP_SMC_POWER_PROFILE_VIDEO,
> @@ -487,7 +487,6 @@ void amdgpu_vcn_ring_begin_use(struct amdgpu_ring *ring)
>   	}
>   	mutex_unlock(&adev->vcn.workload_profile_mutex);
>   
> -pg_lock:
>   	mutex_lock(&vcn_inst->vcn_pg_lock);
>   	vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_UNGATE);
>   
> @@ -528,6 +527,7 @@ void amdgpu_vcn_ring_end_use(struct amdgpu_ring *ring)
>   		atomic_dec(&ring->adev->vcn.inst[ring->me].dpg_enc_submission_cnt);
>   
>   	atomic_dec(&ring->adev->vcn.inst[ring->me].total_submission_cnt);
> +	atomic_dec(&ring->adev->vcn.total_submission_cnt);
>   
>   	schedule_delayed_work(&ring->adev->vcn.inst[ring->me].idle_work,
>   			      VCN_IDLE_TIMEOUT);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> index b3fb1d0e43fc9..febc3ce8641ff 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> @@ -352,6 +352,7 @@ struct amdgpu_vcn {
>   
>   	uint16_t inst_mask;
>   	uint8_t	num_inst_per_aid;
> +	atomic_t		total_submission_cnt;
>   
>   	/* IP reg dump */
>   	uint32_t		*ip_dump;



More information about the amd-gfx mailing list