[PATCH 03/12] drm/amd/display: fix dmub access race condition
Klara Modin
klarasmodin at gmail.com
Tue Aug 5 11:51:41 UTC 2025
Hi,
On 2025-07-30 14:58:54 -0400, Roman.Li at amd.com wrote:
> From: Aurabindo Pillai <aurabindo.pillai at amd.com>
>
> Accessing DC from amdgpu_dm is usually preceded by acquisition of the
> dc_lock mutex, and most of the DC APIs that DM calls are under a DC
> lock. However, a few are not. Some DC APIs called from interrupt
> context end up sending DMUB commands while other threads are using
> DMUB. This was apparent from a race between the calls that enable or
> disable idle optimizations and the DC API that sets vmin/vmax.
>
> Offload the call to dc_stream_adjust_vmin_vmax() to a worker thread
> instead of calling it directly from the interrupt handler, so that it
> can wait for dc_lock.
>
> Reviewed-by: Nicholas Kazlauskas <nicholas.kazlauskas at amd.com>
> Signed-off-by: Aurabindo Pillai <aurabindo.pillai at amd.com>
> Signed-off-by: Roman Li <roman.li at amd.com>
With this patch I get a "sleeping function called from invalid context"
splat roughly every second. This occurs with at least PREEMPT_LAZY and
PREEMPT_VOLUNTARY.
BUG: sleeping function called from invalid context at include/linux/sched/mm.h:321
in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 13841, name: cc1
preempt_count: 10002, expected: 0
CPU: 23 UID: 1000 PID: 13841 Comm: cc1 Kdump: loaded Tainted: G W 6.16.0-next-20250805 #631 PREEMPT_LAZY
Tainted: [W]=WARN
Hardware name: Micro-Star International Co., Ltd. MS-7C91/MAG B550 TOMAHAWK (MS-7C91), BIOS A.I0 03/20/2025
Call Trace:
<TASK>
dump_stack_lvl+0x4b/0x70
__might_resched.cold+0xac/0xb9
__kmalloc_cache_noprof+0x2d4/0x430
? schedule_dc_vmin_vmax+0x49/0x1c0 [amdgpu]
? __alloc_frozen_pages_noprof+0x18f/0x360
? _raw_spin_unlock_irqrestore+0x12/0x40
? schedule_dc_vmin_vmax+0x49/0x1c0 [amdgpu]
schedule_dc_vmin_vmax+0x49/0x1c0 [amdgpu]
dm_crtc_high_irq+0x2ab/0x310 [amdgpu]
amdgpu_dm_irq_handler+0x8d/0x210 [amdgpu]
amdgpu_irq_dispatch+0x166/0x1a0 [amdgpu]
amdgpu_ih_process+0x60/0x160 [amdgpu]
amdgpu_irq_handler+0x23/0x60 [amdgpu]
__handle_irq_event_percpu+0x4a/0x1a0
handle_irq_event+0x38/0x90
handle_edge_irq+0xc4/0x190
__common_interrupt+0x44/0xe0
? srso_return_thunk+0x5/0x5f
common_interrupt+0x3a/0xa0
asm_common_interrupt+0x26/0x40
RIP: 0033:0x55d4848fab81
Code: 28 44 8b 5f 70 ff 87 80 00 00 00 49 89 d7 48 89 34 24 45 8d 73 ff 44 89 f3 21 cb 48 8b 4f 58 41 89 dd 49 c1 e5 03 4a 8b 2c 29 <48> 85 ed 0f 84 36 02 00 00 48 83 fd ff 0f 84 bc 00 00 00 44 39 4d
RSP: 002b:00007fff71eeed50 EFLAGS: 00000212
RAX: 000000000000001f RBX: 0000000000002182 RCX: 00007f47b3f00010
RDX: 000000000000001f RSI: 000055d4bfce1e3c RDI: 000055d4bfbb2e80
RBP: 0000000000000000 R08: 0000000000000001 R09: 000000006d35e182
R10: 000055d4bfb94630 R11: 0000000000004000 R12: 000055d4bfbb2e80
R13: 0000000000010c10 R14: 0000000000003fff R15: 000000000000001f
</TASK>
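
The allocation is reached directly from the CRTC interrupt handler
(dm_crtc_high_irq -> schedule_dc_vmin_vmax in the trace above), where
sleeping is not allowed, while GFP_KERNEL explicitly permits the
allocator to sleep. That is what the check behind the splat tests for;
roughly (paraphrased from might_alloc() in include/linux/sched/mm.h,
which the BUG line points at):

	/* every slab allocation passes through a check like this;
	 * GFP_KERNEL implies __GFP_DIRECT_RECLAIM, i.e. "may sleep" */
	static inline void might_alloc(gfp_t gfp_mask)
	{
		fs_reclaim_acquire(gfp_mask);
		fs_reclaim_release(gfp_mask);

		might_sleep_if(gfpflags_allow_blocking(gfp_mask));
	}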
...
> +static void schedule_dc_vmin_vmax(struct amdgpu_device *adev,
> +				  struct dc_stream_state *stream,
> +				  struct dc_crtc_timing_adjust *adjust)
> +{
> +	struct vupdate_offload_work *offload_work = kzalloc(sizeof(*offload_work), GFP_KERNEL);
> +	if (!offload_work) {
> +		drm_dbg_driver(adev_to_drm(adev), "Failed to allocate vupdate_offload_work\n");
> +		return;
> +	}
> +
> +	struct dc_crtc_timing_adjust *adjust_copy = kzalloc(sizeof(*adjust_copy), GFP_KERNEL);
> +	if (!adjust_copy) {
> +		drm_dbg_driver(adev_to_drm(adev), "Failed to allocate adjust_copy\n");
> +		kfree(offload_work);
> +		return;
> +	}
> +
> +	dc_stream_retain(stream);
> +	memcpy(adjust_copy, adjust, sizeof(*adjust_copy));
> +
> +	INIT_WORK(&offload_work->work, dm_handle_vmin_vmax_update);
> +	offload_work->adev = adev;
> +	offload_work->stream = stream;
> +	offload_work->adjust = adjust_copy;
> +
> +	queue_work(system_wq, &offload_work->work);
> +}
> +
The allocations in this function seem to be the culprit. If I change
them to GFP_ATOMIC, the issue disappears:
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index 31ea57edeb45..afe0fea13bb0 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -550,13 +550,13 @@ static void schedule_dc_vmin_vmax(struct amdgpu_device *adev,
 					  struct dc_stream_state *stream,
 					  struct dc_crtc_timing_adjust *adjust)
 {
-	struct vupdate_offload_work *offload_work = kzalloc(sizeof(*offload_work), GFP_KERNEL);
+	struct vupdate_offload_work *offload_work = kzalloc(sizeof(*offload_work), GFP_ATOMIC);
 	if (!offload_work) {
 		drm_dbg_driver(adev_to_drm(adev), "Failed to allocate vupdate_offload_work\n");
 		return;
 	}
 
-	struct dc_crtc_timing_adjust *adjust_copy = kzalloc(sizeof(*adjust_copy), GFP_KERNEL);
+	struct dc_crtc_timing_adjust *adjust_copy = kzalloc(sizeof(*adjust_copy), GFP_ATOMIC);
 	if (!adjust_copy) {
 		drm_dbg_driver(adev_to_drm(adev), "Failed to allocate adjust_copy\n");
 		kfree(offload_work);
...
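
GFP_ATOMIC avoids the splat because it forbids the allocator from
sleeping, at the cost of failing more readily under memory pressure, so
the early-return paths above still matter; dropping an occasional
vmin/vmax update seems preferable to blocking the interrupt handler.
For reference, a minimal sketch of the IRQ-to-worker pattern this
function implements, with hypothetical names:

	#include <linux/slab.h>
	#include <linux/workqueue.h>

	struct my_offload_work {
		struct work_struct work;
		/* payload copied while still in IRQ context */
	};

	static void my_work_fn(struct work_struct *work)
	{
		struct my_offload_work *w =
			container_of(work, struct my_offload_work, work);

		/* process context: taking mutexes such as dc_lock or
		 * sleeping is fine here */
		kfree(w);
	}

	static void my_irq_path(void)
	{
		/* in IRQ context nothing may sleep, hence GFP_ATOMIC */
		struct my_offload_work *w = kzalloc(sizeof(*w), GFP_ATOMIC);

		if (!w)
			return;	/* drop the update rather than block */

		INIT_WORK(&w->work, my_work_fn);
		queue_work(system_wq, &w->work);
	}

Preallocating the work item would be another option if failures of the
atomic allocation ever become a problem.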
Regards,
Klara Modin