[PATCH] drm/amdgpu/vcn: fix video profile race condition (v3)
Wu, David
davidwu2 at amd.com
Wed Aug 13 18:16:41 UTC 2025
On 8/13/2025 12:51 PM, Alex Deucher wrote:
> On Wed, Aug 13, 2025 at 12:39 PM Wu, David <davidwu2 at amd.com> wrote:
>> Hi Alex,
>>
>> The addition of total_submission_cnt should work, in that it is unlikely
>> for a context switch to happen right after begin_use().
>> As for the suggestion of moving it inside the lock (which I prefer, in case
>> someone later adds code before the lock and it is not reviewed thoroughly)
>> - that is up to you to decide.
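A rough sketch of the two placements being discussed, using the names from
the v3 patch below (not meant to compile on its own):

    atomic_inc(&adev->vcn.total_submission_cnt);    /* v3: before the cancel */
    cancel_delayed_work_sync(&vcn_inst->idle_work);

    mutex_lock(&adev->vcn.workload_profile_mutex);
    /* alternative suggested above: do the atomic_inc() here instead, so any
     * code someone later adds above the lock cannot run before the counter
     * is visible to the idle worker's check */
    if (!adev->vcn.workload_profile_active)
            amdgpu_dpm_switch_power_profile(adev, PP_SMC_POWER_PROFILE_VIDEO,
                                            true);
    mutex_unlock(&adev->vcn.workload_profile_mutex);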
>>
>> Reviewed-by: David (Ming Qiang) Wu <David.Wu3 at amd.com>
>>
>> Thanks,
>> David
>> On 8/13/2025 9:45 AM, Alex Deucher wrote:
>>> If there are multiple instances of the VCN running,
>>> we may end up switching the video profile while another
>>> instance is active because we only take into account
>>> the current instance's submissions. Look at all
>>> outstanding fences for the video profile.
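For reference, the pre-patch handler only ever counted its own instance,
which is where the race comes from; roughly (condensed from the removed
lines in the diff below):

    /* old handler: i == vcn_inst->inst, no other instance is inspected */
    for (j = 0; j < adev->vcn.inst[i].num_enc_rings; ++j)
            fence[i] += amdgpu_fence_count_emitted(&vcn_inst->ring_enc[j]);
    fence[i] += amdgpu_fence_count_emitted(&vcn_inst->ring_dec);
    fences += fence[i];

    if (!fences && !atomic_read(&vcn_inst->total_submission_cnt)) {
            /* drops the VIDEO profile even if another instance is mid-job */
            if (adev->vcn.workload_profile_active)
                    amdgpu_dpm_switch_power_profile(adev,
                                                    PP_SMC_POWER_PROFILE_VIDEO,
                                                    false);
    }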
>>>
>>> v2: drop early exit in begin_use()
>>> v3: handle possible race between begin_use() and the work handler
>>>
>>> Fixes: 3b669df92c85 ("drm/amdgpu/vcn: adjust workload profile handling")
>>> Reviewed-by: Sathishkumar S <sathishkumar.sundararaju at amd.com> (v1)
>>> Signed-off-by: Alex Deucher <alexander.deucher at amd.com>
>>> ---
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 40 ++++++++++++-------------
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h | 1 +
>>> 2 files changed, 21 insertions(+), 20 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>> index 9a76e11d1c184..593c1ddf8819b 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>> @@ -415,19 +415,25 @@ static void amdgpu_vcn_idle_work_handler(struct work_struct *work)
>>> struct amdgpu_vcn_inst *vcn_inst =
>>> container_of(work, struct amdgpu_vcn_inst, idle_work.work);
>>> struct amdgpu_device *adev = vcn_inst->adev;
>>> - unsigned int fences = 0, fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
>>> - unsigned int i = vcn_inst->inst, j;
>>> + unsigned int total_fences = 0, fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
>>> + unsigned int i, j;
>>> int r = 0;
>>>
>>> - if (adev->vcn.harvest_config & (1 << i))
>>> + if (adev->vcn.harvest_config & (1 << vcn_inst->inst))
>>> return;
>>>
>>> - for (j = 0; j < adev->vcn.inst[i].num_enc_rings; ++j)
>>> - fence[i] += amdgpu_fence_count_emitted(&vcn_inst->ring_enc[j]);
>>> + for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
>>> + struct amdgpu_vcn_inst *v = &adev->vcn.inst[i];
>>> +
>>> + for (j = 0; j < v->num_enc_rings; ++j)
>>> + fence[i] += amdgpu_fence_count_emitted(&v->ring_enc[j]);
>>> + fence[i] += amdgpu_fence_count_emitted(&v->ring_dec);
>>> + total_fences += fence[i];
>>> + }
>>>
>>> /* Only set DPG pause for VCN3 or below, VCN4 and above will be handled by FW */
>>> if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG &&
>>> - !adev->vcn.inst[i].using_unified_queue) {
>>> + !vcn_inst->using_unified_queue) {
>>> struct dpg_pause_state new_state;
>>>
>>> if (fence[i] ||
>>> @@ -436,18 +442,18 @@ static void amdgpu_vcn_idle_work_handler(struct work_struct *work)
>>> else
>>> new_state.fw_based = VCN_DPG_STATE__UNPAUSE;
>>>
>>> - adev->vcn.inst[i].pause_dpg_mode(vcn_inst, &new_state);
>>> + vcn_inst->pause_dpg_mode(vcn_inst, &new_state);
>>> }
>>>
>>> - fence[i] += amdgpu_fence_count_emitted(&vcn_inst->ring_dec);
>>> - fences += fence[i];
>>> -
>>> - if (!fences && !atomic_read(&vcn_inst->total_submission_cnt)) {
>>> + if (!fence[vcn_inst->inst] && !atomic_read(&vcn_inst->total_submission_cnt)) {
>>> + /* This is specific to this instance */
>>> mutex_lock(&vcn_inst->vcn_pg_lock);
>>> vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_GATE);
>>> mutex_unlock(&vcn_inst->vcn_pg_lock);
>>> mutex_lock(&adev->vcn.workload_profile_mutex);
>>> - if (adev->vcn.workload_profile_active) {
>>> + /* This is global and depends on all VCN instances */
>>> + if (adev->vcn.workload_profile_active && !total_fences &&
>>> + !atomic_read(&adev->vcn.total_submission_cnt)) {
>>> r = amdgpu_dpm_switch_power_profile(adev, PP_SMC_POWER_PROFILE_VIDEO,
>>> false);
>>> if (r)
>>> @@ -467,16 +473,10 @@ void amdgpu_vcn_ring_begin_use(struct amdgpu_ring *ring)
>>> int r = 0;
>>>
>>> atomic_inc(&vcn_inst->total_submission_cnt);
>>> + atomic_inc(&adev->vcn.total_submission_cnt);
>> move this addition down inside the mutex lock
>>> cancel_delayed_work_sync(&vcn_inst->idle_work);
>>>
>>> - /* We can safely return early here because we've cancelled the
>>> - * the delayed work so there is no one else to set it to false
>>> - * and we don't care if someone else sets it to true.
>>> - */
>>> - if (adev->vcn.workload_profile_active)
>>> - goto pg_lock;
>>> -
>>> mutex_lock(&adev->vcn.workload_profile_mutex);
>> move to here:
>> atomic_inc(&adev->vcn.total_submission_cnt);
>> I think this should work for multiple instances.
> Why does this need to be protected by the mutex?
Hmm, OK - no need, and it is actually better before the mutex.
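Roughly, the reason a bare atomic is enough here (a sketch using the names
from the v3 patch): even if the idle worker samples the counter just before
the increment, it can only drop the profile under workload_profile_mutex,
and begin_use() re-checks workload_profile_active under that same mutex
right afterwards:

    /* idle worker: the profile is only ever dropped under the mutex */
    mutex_lock(&adev->vcn.workload_profile_mutex);
    if (adev->vcn.workload_profile_active && !total_fences &&
        !atomic_read(&adev->vcn.total_submission_cnt))
            amdgpu_dpm_switch_power_profile(adev, PP_SMC_POWER_PROFILE_VIDEO,
                                            false);
    mutex_unlock(&adev->vcn.workload_profile_mutex);

    /* worst case the worker wins the race and turns the profile off, but
     * begin_use() then takes the same mutex, sees !workload_profile_active
     * and turns it back on before the submission proceeds */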
David
> Alex
>
>> David
>>> if (!adev->vcn.workload_profile_active) {
>>> r = amdgpu_dpm_switch_power_profile(adev, PP_SMC_POWER_PROFILE_VIDEO,
>>> @@ -487,7 +487,6 @@ void amdgpu_vcn_ring_begin_use(struct amdgpu_ring *ring)
>>> }
>>> mutex_unlock(&adev->vcn.workload_profile_mutex);
>>>
>>> -pg_lock:
>>> mutex_lock(&vcn_inst->vcn_pg_lock);
>>> vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_UNGATE);
>>>
>>> @@ -528,6 +527,7 @@ void amdgpu_vcn_ring_end_use(struct amdgpu_ring *ring)
>>> atomic_dec(&ring->adev->vcn.inst[ring->me].dpg_enc_submission_cnt);
>>>
>>> atomic_dec(&ring->adev->vcn.inst[ring->me].total_submission_cnt);
>>> + atomic_dec(&ring->adev->vcn.total_submission_cnt);
>>>
>>> schedule_delayed_work(&ring->adev->vcn.inst[ring->me].idle_work,
>>> VCN_IDLE_TIMEOUT);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>> index b3fb1d0e43fc9..febc3ce8641ff 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>> @@ -352,6 +352,7 @@ struct amdgpu_vcn {
>>>
>>> uint16_t inst_mask;
>>> uint8_t num_inst_per_aid;
>>> + atomic_t total_submission_cnt;
>>>
>>> /* IP reg dump */
>>> uint32_t *ip_dump;