[PATCH 03/12] drm/amd/display: fix dmub access race condition
Klara Modin
klarasmodin at gmail.com
Tue Aug 5 11:51:41 UTC 2025
Hi,
On 2025-07-30 14:58:54 -0400, Roman.Li at amd.com wrote:
> From: Aurabindo Pillai <aurabindo.pillai at amd.com>
>
> Accessing DC from amdgpu_dm is usually preceded by acquisition of the
> dc_lock mutex, and most of the DC APIs that DM calls are under a DC
> lock. However, a few are not. Some DC APIs called from interrupt
> context end up sending DMUB commands while other threads are using
> DMUB. This was apparent from a race between the calls that enable or
> disable idle optimizations and the DC API that sets vmin/vmax.
>
> Offload the call to dc_stream_adjust_vmin_vmax() to a worker thread
> instead of calling it directly from the interrupt handler, so that it
> can wait for dc_lock.
>
> Reviewed-by: Nicholas Kazlauskas <nicholas.kazlauskas at amd.com>
> Signed-off-by: Aurabindo Pillai <aurabindo.pillai at amd.com>
> Signed-off-by: Roman Li <roman.li at amd.com>
With this patch I get a "sleeping function called from invalid context"
splat roughly every second. This occurs with at least PREEMPT_LAZY and
PREEMPT_VOLUNTARY.
BUG: sleeping function called from invalid context at include/linux/sched/mm.h:321
in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 13841, name: cc1
preempt_count: 10002, expected: 0
CPU: 23 UID: 1000 PID: 13841 Comm: cc1 Kdump: loaded Tainted: G W 6.16.0-next-20250805 #631 PREEMPT_LAZY
Tainted: [W]=WARN
Hardware name: Micro-Star International Co., Ltd. MS-7C91/MAG B550 TOMAHAWK (MS-7C91), BIOS A.I0 03/20/2025
Call Trace:
<TASK>
dump_stack_lvl+0x4b/0x70
__might_resched.cold+0xac/0xb9
__kmalloc_cache_noprof+0x2d4/0x430
? schedule_dc_vmin_vmax+0x49/0x1c0 [amdgpu]
? __alloc_frozen_pages_noprof+0x18f/0x360
? _raw_spin_unlock_irqrestore+0x12/0x40
? schedule_dc_vmin_vmax+0x49/0x1c0 [amdgpu]
schedule_dc_vmin_vmax+0x49/0x1c0 [amdgpu]
dm_crtc_high_irq+0x2ab/0x310 [amdgpu]
amdgpu_dm_irq_handler+0x8d/0x210 [amdgpu]
amdgpu_irq_dispatch+0x166/0x1a0 [amdgpu]
amdgpu_ih_process+0x60/0x160 [amdgpu]
amdgpu_irq_handler+0x23/0x60 [amdgpu]
__handle_irq_event_percpu+0x4a/0x1a0
handle_irq_event+0x38/0x90
handle_edge_irq+0xc4/0x190
__common_interrupt+0x44/0xe0
? srso_return_thunk+0x5/0x5f
common_interrupt+0x3a/0xa0
asm_common_interrupt+0x26/0x40
RIP: 0033:0x55d4848fab81
Code: 28 44 8b 5f 70 ff 87 80 00 00 00 49 89 d7 48 89 34 24 45 8d 73 ff 44 89 f3 21 cb 48 8b 4f 58 41 89 dd 49 c1 e5 03 4a 8b 2c 29 <48> 85 ed 0f 84 36 02 00 00 48 83 fd ff 0f 84 bc 00 00 00 44 39 4d
RSP: 002b:00007fff71eeed50 EFLAGS: 00000212
RAX: 000000000000001f RBX: 0000000000002182 RCX: 00007f47b3f00010
RDX: 000000000000001f RSI: 000055d4bfce1e3c RDI: 000055d4bfbb2e80
RBP: 0000000000000000 R08: 0000000000000001 R09: 000000006d35e182
R10: 000055d4bfb94630 R11: 0000000000004000 R12: 000055d4bfbb2e80
R13: 0000000000010c10 R14: 0000000000003fff R15: 000000000000001f
</TASK>
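
The allocation is reached directly from the CRTC interrupt handler
(dm_crtc_high_irq -> schedule_dc_vmin_vmax in the trace above), where
sleeping is not allowed, while GFP_KERNEL explicitly permits the
allocator to sleep. That is what the check behind the splat tests for;
roughly (paraphrased from might_alloc() in include/linux/sched/mm.h,
which the BUG line points at):

	/* every slab allocation passes through a check like this;
	 * GFP_KERNEL implies __GFP_DIRECT_RECLAIM, i.e. "may sleep" */
	static inline void might_alloc(gfp_t gfp_mask)
	{
		fs_reclaim_acquire(gfp_mask);
		fs_reclaim_release(gfp_mask);

		might_sleep_if(gfpflags_allow_blocking(gfp_mask));
	}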
...
> +static void schedule_dc_vmin_vmax(struct amdgpu_device *adev,
> +				  struct dc_stream_state *stream,
> +				  struct dc_crtc_timing_adjust *adjust)
> +{
> +	struct vupdate_offload_work *offload_work = kzalloc(sizeof(*offload_work), GFP_KERNEL);
> +	if (!offload_work) {
> +		drm_dbg_driver(adev_to_drm(adev), "Failed to allocate vupdate_offload_work\n");
> +		return;
> +	}
> +
> +	struct dc_crtc_timing_adjust *adjust_copy = kzalloc(sizeof(*adjust_copy), GFP_KERNEL);
> +	if (!adjust_copy) {
> +		drm_dbg_driver(adev_to_drm(adev), "Failed to allocate adjust_copy\n");
> +		kfree(offload_work);
> +		return;
> +	}
> +
> +	dc_stream_retain(stream);
> +	memcpy(adjust_copy, adjust, sizeof(*adjust_copy));
> +
> +	INIT_WORK(&offload_work->work, dm_handle_vmin_vmax_update);
> +	offload_work->adev = adev;
> +	offload_work->stream = stream;
> +	offload_work->adjust = adjust_copy;
> +
> +	queue_work(system_wq, &offload_work->work);
> +}
> +
The allocations in this function seem to be the culprit. If I change
them to GFP_ATOMIC, the issue disappears:
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index 31ea57edeb45..afe0fea13bb0 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -550,13 +550,13 @@ static void schedule_dc_vmin_vmax(struct amdgpu_device *adev,
 					  struct dc_stream_state *stream,
 					  struct dc_crtc_timing_adjust *adjust)
 {
-	struct vupdate_offload_work *offload_work = kzalloc(sizeof(*offload_work), GFP_KERNEL);
+	struct vupdate_offload_work *offload_work = kzalloc(sizeof(*offload_work), GFP_ATOMIC);
 	if (!offload_work) {
 		drm_dbg_driver(adev_to_drm(adev), "Failed to allocate vupdate_offload_work\n");
 		return;
 	}
 
-	struct dc_crtc_timing_adjust *adjust_copy = kzalloc(sizeof(*adjust_copy), GFP_KERNEL);
+	struct dc_crtc_timing_adjust *adjust_copy = kzalloc(sizeof(*adjust_copy), GFP_ATOMIC);
 	if (!adjust_copy) {
 		drm_dbg_driver(adev_to_drm(adev), "Failed to allocate adjust_copy\n");
 		kfree(offload_work);
...
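
GFP_ATOMIC avoids the splat because it forbids the allocator from
sleeping, at the cost of failing more readily under memory pressure, so
the early-return paths above still matter; dropping an occasional
vmin/vmax update seems preferable to blocking the interrupt handler.
For reference, a minimal sketch of the IRQ-to-worker pattern this
function implements, with hypothetical names:

	#include <linux/slab.h>
	#include <linux/workqueue.h>

	struct my_offload_work {
		struct work_struct work;
		/* payload copied while still in IRQ context */
	};

	static void my_work_fn(struct work_struct *work)
	{
		struct my_offload_work *w =
			container_of(work, struct my_offload_work, work);

		/* process context: taking mutexes such as dc_lock or
		 * sleeping is fine here */
		kfree(w);
	}

	static void my_irq_path(void)
	{
		/* in IRQ context nothing may sleep, hence GFP_ATOMIC */
		struct my_offload_work *w = kzalloc(sizeof(*w), GFP_ATOMIC);

		if (!w)
			return;	/* drop the update rather than block */

		INIT_WORK(&w->work, my_work_fn);
		queue_work(system_wq, &w->work);
	}

Preallocating the work item would be another option if failures of the
atomic allocation ever become a problem.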
Regards,
Klara Modin