[PATCH] drm/amdgpu/vcn: Fix race condition during dpm off

Wed Aug 13 21:57:03 UTC 2025

On Wed, Aug 13, 2025 at 5:09 PM Sathishkumar S
<sathishkumar.sundararaju at amd.com> wrote:
>
> There is a race condition which leads to dpm video power profile
> OFF during active video decode on multi-instance VCN hardware.
> Add flags to track ON/OFF vcn instances and check if all
> instances are off before turning OFF video power profile.
>
> v2:
> using per instance lock is wrong, use a global lock (David Wu)
>
> Fixes: 3b669df92c85 ("drm/amdgpu/vcn: adjust workload profile handling")
> Signed-off-by: Sathishkumar S <sathishkumar.sundararaju at amd.com>
> Signed-off-by: Alex Deucher <alexander.deucher at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  1 -
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c    | 31 +++++++++-------------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h    |  7 +++--
>  drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c      | 18 +++----------
>  4 files changed, 21 insertions(+), 36 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index f7d7e4748197..69a889b3222a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4444,7 +4444,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>         }
>         mutex_init(&adev->gfx.userq_sch_mutex);
>         mutex_init(&adev->gfx.workload_profile_mutex);
> -       mutex_init(&adev->vcn.workload_profile_mutex);

We already have the global vcn.workload_profile_mutex.  Why not just
use that?  It better describes what it is protecting.

>         mutex_init(&adev->userq_mutex);
>
>         amdgpu_device_init_apu_flags(adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> index 9a76e11d1c18..e034baa77977 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> @@ -102,6 +102,7 @@ int amdgpu_vcn_early_init(struct amdgpu_device *adev, int i)
>         adev->vcn.inst[i].adev = adev;
>         adev->vcn.inst[i].inst = i;
>         amdgpu_ucode_ip_version_decode(adev, UVD_HWIP, ucode_prefix, sizeof(ucode_prefix));
> +       mutex_init(&adev->vcn.lock);

This will get initialized multiple times as this function is called
per instance.  Better to initialize it in amdgpu_device_init().

>
>         if (i != 0 && adev->vcn.per_inst_fw) {
>                 r = amdgpu_ucode_request(adev, &adev->vcn.inst[i].fw,
> @@ -134,7 +135,6 @@ int amdgpu_vcn_sw_init(struct amdgpu_device *adev, int i)
>         int r;
>
>         mutex_init(&adev->vcn.inst[i].vcn1_jpeg1_workaround);
> -       mutex_init(&adev->vcn.inst[i].vcn_pg_lock);

You can probably drop this lock as a separate patch.  I don't see any
concurrent access where we would need it.

>         mutex_init(&adev->vcn.inst[i].engine_reset_mutex);
>         atomic_set(&adev->vcn.inst[i].total_submission_cnt, 0);
>         INIT_DELAYED_WORK(&adev->vcn.inst[i].idle_work, amdgpu_vcn_idle_work_handler);
> @@ -290,7 +290,6 @@ int amdgpu_vcn_sw_fini(struct amdgpu_device *adev, int i)
>         if (adev->vcn.reg_list)
>                 amdgpu_vcn_reg_dump_fini(adev);
>
> -       mutex_destroy(&adev->vcn.inst[i].vcn_pg_lock);
>         mutex_destroy(&adev->vcn.inst[i].vcn1_jpeg1_workaround);
>
>         return 0;
> @@ -443,18 +442,18 @@ static void amdgpu_vcn_idle_work_handler(struct work_struct *work)
>         fences += fence[i];
>
>         if (!fences && !atomic_read(&vcn_inst->total_submission_cnt)) {
> -               mutex_lock(&vcn_inst->vcn_pg_lock);
> +               mutex_lock(&adev->vcn.lock);
>                 vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_GATE);
> -               mutex_unlock(&vcn_inst->vcn_pg_lock);
> -               mutex_lock(&adev->vcn.workload_profile_mutex);
> -               if (adev->vcn.workload_profile_active) {
> +               adev->vcn.flags &= AMDGPU_VCN_FLAG_VINST_OFF(vcn_inst->inst);
> +               if (adev->vcn.workload_profile_active &&
> +                   !(adev->vcn.flags & AMDGPU_VCN_FLAG_VINST_MASK(adev->vcn.num_vcn_inst))) {

I still don't see how looking at the global emitted fences here would
race, but I don't have a strong preference on how we solve this.

Alex

>                         r = amdgpu_dpm_switch_power_profile(adev, PP_SMC_POWER_PROFILE_VIDEO,
>                                                             false);
>                         if (r)
>                                 dev_warn(adev->dev, "(%d) failed to disable video power profile mode\n", r);
>                         adev->vcn.workload_profile_active = false;
>                 }
> -               mutex_unlock(&adev->vcn.workload_profile_mutex);
> +               mutex_unlock(&adev->vcn.lock);
>         } else {
>                 schedule_delayed_work(&vcn_inst->idle_work, VCN_IDLE_TIMEOUT);
>         }
> @@ -470,14 +469,8 @@ void amdgpu_vcn_ring_begin_use(struct amdgpu_ring *ring)
>
>         cancel_delayed_work_sync(&vcn_inst->idle_work);
>
> -       /* We can safely return early here because we've cancelled the
> -        * the delayed work so there is no one else to set it to false
> -        * and we don't care if someone else sets it to true.
> -        */
> -       if (adev->vcn.workload_profile_active)
> -               goto pg_lock;
> +       mutex_lock(&adev->vcn.lock);
>
> -       mutex_lock(&adev->vcn.workload_profile_mutex);
>         if (!adev->vcn.workload_profile_active) {
>                 r = amdgpu_dpm_switch_power_profile(adev, PP_SMC_POWER_PROFILE_VIDEO,
>                                                     true);
> @@ -485,11 +478,11 @@ void amdgpu_vcn_ring_begin_use(struct amdgpu_ring *ring)
>                         dev_warn(adev->dev, "(%d) failed to switch to video power profile mode\n", r);
>                 adev->vcn.workload_profile_active = true;
>         }
> -       mutex_unlock(&adev->vcn.workload_profile_mutex);
>
> -pg_lock:
> -       mutex_lock(&vcn_inst->vcn_pg_lock);
> -       vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_UNGATE);
> +       if (!(adev->vcn.flags & AMDGPU_VCN_FLAG_VINST_ON(vcn_inst->inst))) {
> +               vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_UNGATE);
> +               adev->vcn.flags |= AMDGPU_VCN_FLAG_VINST_ON(vcn_inst->inst);
> +       }
>
>         /* Only set DPG pause for VCN3 or below, VCN4 and above will be handled by FW */
>         if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG &&
> @@ -514,7 +507,7 @@ void amdgpu_vcn_ring_begin_use(struct amdgpu_ring *ring)
>
>                 vcn_inst->pause_dpg_mode(vcn_inst, &new_state);
>         }
> -       mutex_unlock(&vcn_inst->vcn_pg_lock);
> +       mutex_unlock(&adev->vcn.lock);
>  }
>
>  void amdgpu_vcn_ring_end_use(struct amdgpu_ring *ring)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> index b3fb1d0e43fc..4457dcc5f9dc 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> @@ -320,7 +320,6 @@ struct amdgpu_vcn_inst {
>         uint8_t                 vcn_config;
>         uint32_t                vcn_codec_disable_mask;
>         atomic_t                total_submission_cnt;
> -       struct mutex            vcn_pg_lock;
>         enum amd_powergating_state cur_state;
>         struct delayed_work     idle_work;
>         unsigned                fw_version;
> @@ -363,9 +362,13 @@ struct amdgpu_vcn {
>         unsigned                fw_version;
>
>         bool                    workload_profile_active;
> -       struct mutex            workload_profile_mutex;
> +       struct mutex            lock;
>         u32 reg_count;
>         const struct amdgpu_hwip_reg_entry *reg_list;
> +#define AMDGPU_VCN_FLAG_VINST_MASK(n)  (BIT(n+1) - 1)
> +#define AMDGPU_VCN_FLAG_VINST_ON(n)    (BIT(n))
> +#define AMDGPU_VCN_FLAG_VINST_OFF(n)   (~BIT(n))
> +       u32 flags;
>  };
>
>  struct amdgpu_fw_shared_rb_ptrs_struct {
> diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
> index 3a7c137a83ef..344ab5c4cb18 100644
> --- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
> +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
> @@ -147,9 +147,9 @@ static void vcn_v2_5_idle_work_handler(struct work_struct *work)
>         }
>
>         if (!fences && !atomic_read(&adev->vcn.inst[0].total_submission_cnt)) {
> +               mutex_lock(&adev->vcn.lock);
>                 amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCN,
>                                                        AMD_PG_STATE_GATE);
> -               mutex_lock(&adev->vcn.workload_profile_mutex);
>                 if (adev->vcn.workload_profile_active) {
>                         r = amdgpu_dpm_switch_power_profile(adev, PP_SMC_POWER_PROFILE_VIDEO,
>                                                             false);
> @@ -157,7 +157,7 @@ static void vcn_v2_5_idle_work_handler(struct work_struct *work)
>                                 dev_warn(adev->dev, "(%d) failed to disable video power profile mode\n", r);
>                         adev->vcn.workload_profile_active = false;
>                 }
> -               mutex_unlock(&adev->vcn.workload_profile_mutex);
> +               mutex_unlock(&adev->vcn.lock);
>         } else {
>                 schedule_delayed_work(&adev->vcn.inst[0].idle_work, VCN_IDLE_TIMEOUT);
>         }
> @@ -173,14 +173,7 @@ static void vcn_v2_5_ring_begin_use(struct amdgpu_ring *ring)
>
>         cancel_delayed_work_sync(&adev->vcn.inst[0].idle_work);
>
> -       /* We can safely return early here because we've cancelled the
> -        * the delayed work so there is no one else to set it to false
> -        * and we don't care if someone else sets it to true.
> -        */
> -       if (adev->vcn.workload_profile_active)
> -               goto pg_lock;
> -
> -       mutex_lock(&adev->vcn.workload_profile_mutex);
> +       mutex_lock(&adev->vcn.lock);
>         if (!adev->vcn.workload_profile_active) {
>                 r = amdgpu_dpm_switch_power_profile(adev, PP_SMC_POWER_PROFILE_VIDEO,
>                                                     true);
> @@ -188,10 +181,7 @@ static void vcn_v2_5_ring_begin_use(struct amdgpu_ring *ring)
>                         dev_warn(adev->dev, "(%d) failed to switch to video power profile mode\n", r);
>                 adev->vcn.workload_profile_active = true;
>         }
> -       mutex_unlock(&adev->vcn.workload_profile_mutex);
>
> -pg_lock:
> -       mutex_lock(&adev->vcn.inst[0].vcn_pg_lock);
>         amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCN,
>                                                AMD_PG_STATE_UNGATE);
>
> @@ -217,7 +207,7 @@ static void vcn_v2_5_ring_begin_use(struct amdgpu_ring *ring)
>                 }
>                 v->pause_dpg_mode(v, &new_state);
>         }
> -       mutex_unlock(&adev->vcn.inst[0].vcn_pg_lock);
> +       mutex_unlock(&adev->vcn.lock);
>  }
>
>  static void vcn_v2_5_ring_end_use(struct amdgpu_ring *ring)
> --
> 2.48.1
>