[PATCH v2] drm/amdgpu: Replace Mutex with Spinlock for RLCG register access to avoid Priority Inversion in SRIOV
Christian König
christian.koenig at amd.com
Mon Feb 17 12:27:35 UTC 2025
On 16.02.25 at 03:43, Srinivasan Shanmugam wrote:
> RLCG register access is the mechanism that lets virtual functions (VFs)
> safely access GPU registers in a virtualized environment, including for
> TLB flushes and register reads. When multiple threads or VFs try to
> access the same registers simultaneously, race conditions can occur, so
> the RLCG interface serializes access: only one thread touches the
> registers at a time, preventing conflicts and ensuring that operations
> are performed correctly. Protecting this path with a mutex is
> problematic, however: a thread that already holds a spinlock must not
> sleep and therefore must not acquire a mutex, and a low-priority task
> holding the mutex can block a high-priority task that needs it
> (priority inversion). Register access in amdgpu_virt_rlcg_reg_rw sits
> on a fast code path, so this is critical.
>
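For illustration, here is a minimal, self-contained sketch of the pattern
the lockdep splat below complains about. All names (invalidate_lock,
rlcg_reg_lock, rlcg_reg_rw, flush_gpu_tlb) are simplified stand-ins for
the driver code, not the actual implementation:

#include <linux/types.h>
#include <linux/mutex.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(invalidate_lock);	/* stand-in for adev->gmc.invalidate_lock */
static DEFINE_MUTEX(rlcg_reg_lock);		/* stand-in for the old adev->virt.rlcg_reg_lock */

static u32 rlcg_reg_rw(u32 offset, u32 v)
{
	u32 ret;

	/* mutex_lock() may sleep, which is illegal in atomic context */
	mutex_lock(&rlcg_reg_lock);
	/* ... program scratch registers, poll for completion ... */
	ret = v;
	mutex_unlock(&rlcg_reg_lock);

	return ret;
}

static void flush_gpu_tlb(u32 vmid)
{
	unsigned long flags;

	/* atomic (non-sleeping) context starts here */
	spin_lock_irqsave(&invalidate_lock, flags);
	rlcg_reg_rw(0x1234, vmid);	/* -> [ BUG: Invalid wait context ] */
	spin_unlock_irqrestore(&invalidate_lock, flags);
}
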
> The call stack shows amdgpu_virt_rlcg_reg_rw, which attempts to acquire
> the mutex, being invoked from amdgpu_sriov_wreg, which in turn is called
> from gmc_v11_0_flush_gpu_tlb while the adev->gmc.invalidate_lock
> spinlock is held.
>
> The [ BUG: Invalid wait context ] splat indicates that a thread is
> trying to acquire a mutex (a sleeping lock) while in a context that
> does not allow sleeping, here because it is holding a spinlock.
>
> Fixes the below:
>
> [ 253.013423] =============================
> [ 253.013434] [ BUG: Invalid wait context ]
> [ 253.013446] 6.12.0-amdstaging-drm-next-lol-050225 #14 Tainted: G U OE
> [ 253.013464] -----------------------------
> [ 253.013475] kworker/0:1/10 is trying to lock:
> [ 253.013487] ffff9f30542e3cf8 (&adev->virt.rlcg_reg_lock){+.+.}-{3:3}, at: amdgpu_virt_rlcg_reg_rw+0xf6/0x330 [amdgpu]
> [ 253.013815] other info that might help us debug this:
> [ 253.013827] context-{4:4}
> [ 253.013835] 3 locks held by kworker/0:1/10:
> [ 253.013847] #0: ffff9f3040050f58 ((wq_completion)events){+.+.}-{0:0}, at: process_one_work+0x3f5/0x680
> [ 253.013877] #1: ffffb789c008be40 ((work_completion)(&wfc.work)){+.+.}-{0:0}, at: process_one_work+0x1d6/0x680
> [ 253.013905] #2: ffff9f3054281838 (&adev->gmc.invalidate_lock){+.+.}-{2:2}, at: gmc_v11_0_flush_gpu_tlb+0x198/0x4f0 [amdgpu]
> [ 253.014154] stack backtrace:
> [ 253.014164] CPU: 0 UID: 0 PID: 10 Comm: kworker/0:1 Tainted: G U OE 6.12.0-amdstaging-drm-next-lol-050225 #14
> [ 253.014189] Tainted: [U]=USER, [O]=OOT_MODULE, [E]=UNSIGNED_MODULE
> [ 253.014203] Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS Hyper-V UEFI Release v4.1 11/18/2024
> [ 253.014224] Workqueue: events work_for_cpu_fn
> [ 253.014241] Call Trace:
> [ 253.014250] <TASK>
> [ 253.014260] dump_stack_lvl+0x9b/0xf0
> [ 253.014275] dump_stack+0x10/0x20
> [ 253.014287] __lock_acquire+0xa47/0x2810
> [ 253.014303] ? srso_alias_return_thunk+0x5/0xfbef5
> [ 253.014321] lock_acquire+0xd1/0x300
> [ 253.014333] ? amdgpu_virt_rlcg_reg_rw+0xf6/0x330 [amdgpu]
> [ 253.014562] ? __lock_acquire+0xa6b/0x2810
> [ 253.014578] __mutex_lock+0x85/0xe20
> [ 253.014591] ? amdgpu_virt_rlcg_reg_rw+0xf6/0x330 [amdgpu]
> [ 253.014782] ? sched_clock_noinstr+0x9/0x10
> [ 253.014795] ? srso_alias_return_thunk+0x5/0xfbef5
> [ 253.014808] ? local_clock_noinstr+0xe/0xc0
> [ 253.014822] ? amdgpu_virt_rlcg_reg_rw+0xf6/0x330 [amdgpu]
> [ 253.015012] ? srso_alias_return_thunk+0x5/0xfbef5
> [ 253.015029] mutex_lock_nested+0x1b/0x30
> [ 253.015044] ? mutex_lock_nested+0x1b/0x30
> [ 253.015057] amdgpu_virt_rlcg_reg_rw+0xf6/0x330 [amdgpu]
> [ 253.015249] amdgpu_sriov_wreg+0xc5/0xd0 [amdgpu]
> [ 253.015435] gmc_v11_0_flush_gpu_tlb+0x44b/0x4f0 [amdgpu]
> [ 253.015667] gfx_v11_0_hw_init+0x499/0x29c0 [amdgpu]
> [ 253.015901] ? __pfx_smu_v13_0_update_pcie_parameters+0x10/0x10 [amdgpu]
> [ 253.016159] ? srso_alias_return_thunk+0x5/0xfbef5
> [ 253.016173] ? smu_hw_init+0x18d/0x300 [amdgpu]
> [ 253.016403] amdgpu_device_init+0x29ad/0x36a0 [amdgpu]
> [ 253.016614] amdgpu_driver_load_kms+0x1a/0xc0 [amdgpu]
> [ 253.017057] amdgpu_pci_probe+0x1c2/0x660 [amdgpu]
> [ 253.017493] local_pci_probe+0x4b/0xb0
> [ 253.017746] work_for_cpu_fn+0x1a/0x30
> [ 253.017995] process_one_work+0x21e/0x680
> [ 253.018248] worker_thread+0x190/0x330
> [ 253.018500] ? __pfx_worker_thread+0x10/0x10
> [ 253.018746] kthread+0xe7/0x120
> [ 253.018988] ? __pfx_kthread+0x10/0x10
> [ 253.019231] ret_from_fork+0x3c/0x60
> [ 253.019468] ? __pfx_kthread+0x10/0x10
> [ 253.019701] ret_from_fork_asm+0x1a/0x30
> [ 253.019939] </TASK>
>
> Fixes: e864180ee49b ("drm/amdgpu: Add lock around VF RLCG interface")
> Cc: lin cao <lin.cao at amd.com>
> Cc: Jingwen Chen <Jingwen.Chen2 at amd.com>
> Cc: Victor Skvortsov <victor.skvortsov at amd.com>
> Cc: Zhigang Luo <zhigang.luo at amd.com>
> Cc: Christian König <christian.koenig at amd.com>
> Cc: Alex Deucher <alexander.deucher at amd.com>
> Signed-off-by: Srinivasan Shanmugam <srinivasan.shanmugam at amd.com>
Reviewed-by: Christian König <christian.koenig at amd.com>
> ---
> v2: s/spin_trylock/spin_lock_irqsave to be safe (Christian).
>
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 5 +++--
> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 ++-
> 3 files changed, 6 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index eab530778fbd..14125cc3a937 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4251,7 +4251,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
> mutex_init(&adev->grbm_idx_mutex);
> mutex_init(&adev->mn_lock);
> mutex_init(&adev->virt.vf_errors.lock);
> - mutex_init(&adev->virt.rlcg_reg_lock);
> hash_init(adev->mn_hash);
> mutex_init(&adev->psp.mutex);
> mutex_init(&adev->notifier_lock);
> @@ -4277,6 +4276,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
> spin_lock_init(&adev->se_cac_idx_lock);
> spin_lock_init(&adev->audio_endpt_idx_lock);
> spin_lock_init(&adev->mm_stats.lock);
> + spin_lock_init(&adev->virt.rlcg_reg_lock);
> spin_lock_init(&adev->wb.lock);
>
> xa_init_flags(&adev->userq_xa, XA_FLAGS_LOCK_IRQ);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> index 2056efaf157d..e6f0152e5b08 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> @@ -1017,6 +1017,7 @@ u32 amdgpu_virt_rlcg_reg_rw(struct amdgpu_device *adev, u32 offset, u32 v, u32 f
> void *scratch_reg2;
> void *scratch_reg3;
> void *spare_int;
> + unsigned long flags;
>
> if (!adev->gfx.rlc.rlcg_reg_access_supported) {
> dev_err(adev->dev,
> @@ -1038,7 +1039,7 @@ u32 amdgpu_virt_rlcg_reg_rw(struct amdgpu_device *adev, u32 offset, u32 v, u32 f
> scratch_reg2 = (void __iomem *)adev->rmmio + 4 * reg_access_ctrl->scratch_reg2;
> scratch_reg3 = (void __iomem *)adev->rmmio + 4 * reg_access_ctrl->scratch_reg3;
>
> - mutex_lock(&adev->virt.rlcg_reg_lock);
> + spin_lock_irqsave(&adev->virt.rlcg_reg_lock, flags);
>
> if (reg_access_ctrl->spare_int)
> spare_int = (void __iomem *)adev->rmmio + 4 * reg_access_ctrl->spare_int;
> @@ -1097,7 +1098,7 @@ u32 amdgpu_virt_rlcg_reg_rw(struct amdgpu_device *adev, u32 offset, u32 v, u32 f
>
> ret = readl(scratch_reg0);
>
> - mutex_unlock(&adev->virt.rlcg_reg_lock);
> + spin_unlock_irqrestore(&adev->virt.rlcg_reg_lock, flags);
>
> return ret;
> }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> index 270a032e2d70..0f3ccae5c1ab 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> @@ -279,7 +279,8 @@ struct amdgpu_virt {
> /* the ucode id to signal the autoload */
> uint32_t autoload_ucode_id;
>
> - struct mutex rlcg_reg_lock;
> + /* Spinlock to protect access to the RLCG register interface */
> + spinlock_t rlcg_reg_lock;
>
> union amd_sriov_ras_caps ras_en_caps;
> union amd_sriov_ras_caps ras_telemetry_en_caps;
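
Applied to the simplified sketch earlier in this mail, the change makes the
inner lock non-sleeping as well, so the spinlock-in-spinlock nesting is
valid for lockdep's wait-context checks. The names remain illustrative
stand-ins, not the actual driver code:

static DEFINE_SPINLOCK(rlcg_reg_spinlock);	/* stand-in for the new adev->virt.rlcg_reg_lock */

static u32 rlcg_reg_rw_fixed(u32 offset, u32 v)
{
	unsigned long flags;
	u32 ret;

	/* non-sleeping and IRQ-safe, fine underneath invalidate_lock */
	spin_lock_irqsave(&rlcg_reg_spinlock, flags);
	/* ... program scratch registers, poll for completion ... */
	ret = v;
	spin_unlock_irqrestore(&rlcg_reg_spinlock, flags);

	return ret;
}

Using spin_lock_irqsave() rather than spin_trylock(), as noted in the v2
changelog, means the register access never silently fails and the lock
remains safe to take even if this path is ever reached with interrupts
disabled.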