[PATCH 14/26] drm/amd/display: fix a memleak issue when driver is removed

Rodrigo Siqueira Rodrigo.Siqueira at amd.com
Thu Oct 3 23:33:32 UTC 2024


From: Aurabindo Pillai <aurabindo.pillai at amd.com>

Running "modprobe amdgpu" the second time (followed by a modprobe -r
amdgpu) causes a call trace like:

[  845.212163] Memory manager not clean during takedown.
[  845.212170] WARNING: CPU: 4 PID: 2481 at drivers/gpu/drm/drm_mm.c:999 drm_mm_takedown+0x2b/0x40
[  845.212177] Modules linked in: amdgpu(OE-) amddrm_ttm_helper(OE) amddrm_buddy(OE) amdxcp(OE) amd_sched(OE) drm_exec drm_suballoc_helper drm_display_helper i2c_algo_bit amdttm(OE) amdkcl(OE) cec rc_core sunrpc qrtr intel_rapl_msr intel_rapl_common snd_hda_codec_hdmi edac_mce_amd snd_hda_intel snd_intel_dspcfg snd_intel_sdw_acpi snd_usb_audio snd_hda_codec snd_usbmidi_lib kvm_amd snd_hda_core snd_ump mc snd_hwdep kvm snd_pcm snd_seq_midi snd_seq_midi_event irqbypass crct10dif_pclmul snd_rawmidi polyval_clmulni polyval_generic ghash_clmulni_intel sha256_ssse3 sha1_ssse3 snd_seq aesni_intel crypto_simd snd_seq_device cryptd snd_timer mfd_aaeon asus_nb_wmi eeepc_wmi joydev asus_wmi snd ledtrig_audio sparse_keymap ccp wmi_bmof input_leds k10temp i2c_piix4 platform_profile rapl soundcore gpio_amdpt mac_hid binfmt_misc msr parport_pc ppdev lp parport efi_pstore nfnetlink dmi_sysfs ip_tables x_tables autofs4 hid_logitech_hidpp hid_logitech_dj hid_generic usbhid hid ahci xhci_pci igc crc32_pclmul libahci xhci_pci_renesas video
[  845.212284]  wmi [last unloaded: amddrm_ttm_helper(OE)]
[  845.212290] CPU: 4 PID: 2481 Comm: modprobe Tainted: G        W  OE      6.8.0-31-generic #31-Ubuntu
[  845.212296] RIP: 0010:drm_mm_takedown+0x2b/0x40
[  845.212300] Code: 1f 44 00 00 48 8b 47 38 48 83 c7 38 48 39 f8 75 09 31 c0 31 ff e9 90 2e 86 00 55 48 c7 c7 d0 f6 8e 8a 48 89 e5 e8 f5 db 45 ff <0f> 0b 5d 31 c0 31 ff e9 74 2e 86 00 66 0f 1f 84 00 00 00 00 00 90
[  845.212302] RSP: 0018:ffffb11302127ae0 EFLAGS: 00010246
[  845.212305] RAX: 0000000000000000 RBX: ffff92aa5020fc08 RCX: 0000000000000000
[  845.212307] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
[  845.212309] RBP: ffffb11302127ae0 R08: 0000000000000000 R09: 0000000000000000
[  845.212310] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000004
[  845.212312] R13: ffff92aa50200000 R14: ffff92aa5020fb10 R15: ffff92aa5020faa0
[  845.212313] FS:  0000707dd7c7c080(0000) GS:ffff92b93de00000(0000) knlGS:0000000000000000
[  845.212316] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  845.212318] CR2: 00007d48b0aee200 CR3: 0000000115a58000 CR4: 0000000000f50ef0
[  845.212320] PKRU: 55555554
[  845.212321] Call Trace:
[  845.212323]  <TASK>
[  845.212328]  ? show_regs+0x6d/0x80
[  845.212333]  ? __warn+0x89/0x160
[  845.212339]  ? drm_mm_takedown+0x2b/0x40
[  845.212344]  ? report_bug+0x17e/0x1b0
[  845.212350]  ? handle_bug+0x51/0xa0
[  845.212355]  ? exc_invalid_op+0x18/0x80
[  845.212359]  ? asm_exc_invalid_op+0x1b/0x20
[  845.212366]  ? drm_mm_takedown+0x2b/0x40
[  845.212371]  amdgpu_gtt_mgr_fini+0xa9/0x130 [amdgpu]
[  845.212645]  amdgpu_ttm_fini+0x264/0x340 [amdgpu]
[  845.212770]  amdgpu_bo_fini+0x2e/0xc0 [amdgpu]
[  845.212894]  gmc_v12_0_sw_fini+0x2a/0x40 [amdgpu]
[  845.213036]  amdgpu_device_fini_sw+0x11a/0x590 [amdgpu]
[  845.213159]  amdgpu_driver_release_kms+0x16/0x40 [amdgpu]
[  845.213302]  devm_drm_dev_init_release+0x5e/0x90
[  845.213305]  devm_action_release+0x12/0x30
[  845.213308]  release_nodes+0x42/0xd0
[  845.213311]  devres_release_all+0x97/0xe0
[  845.213314]  device_unbind_cleanup+0x12/0x80
[  845.213317]  device_release_driver_internal+0x230/0x270
[  845.213319]  ? srso_alias_return_thunk+0x5/0xfbef5

This is caused by lost memory during early init phase. First time driver
is removed, memory is freed but when second time the driver is inserted,
VBIOS dmub is not active, since the PSP policy is to retain the driver
loaded version on subsequent warm boots. Hence, communication with VBIOS
DMUB fails.

Fix this by aborting further communication with vbios dmub and release
the memory immediately.

Fixes: 3284f08a2324 ("drm/amd/display: free bo used for dmub bounding box")
Reviewed-by: Rodrigo Siqueira <rodrigo.siqueira at amd.com>
Signed-off-by: Aurabindo Pillai <aurabindo.pillai at amd.com>
Signed-off-by: Rodrigo Siqueira <rodrigo.siqueira at amd.com>
---
 .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 32 ++++++++++++++++---
 .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h |  3 ++
 .../amd/display/amdgpu_dm/amdgpu_dm_helpers.c | 13 ++------
 3 files changed, 33 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index 7616d7a509b9..73c393d6c457 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -1696,6 +1696,26 @@ dm_allocate_gpu_mem(
 	return da->cpu_ptr;
 }
 
+void
+dm_free_gpu_mem(
+		struct amdgpu_device *adev,
+		enum dc_gpu_mem_alloc_type type,
+		void *pvMem)
+{
+	struct dal_allocation *da;
+
+	/* walk the da list in DM */
+	list_for_each_entry(da, &adev->dm.da_list, list) {
+		if (pvMem == da->cpu_ptr) {
+			amdgpu_bo_free_kernel(&da->bo, &da->gpu_addr, &da->cpu_ptr);
+			list_del(&da->list);
+			kfree(da);
+			break;
+		}
+	}
+
+}
+
 static enum dmub_status
 dm_dmub_send_vbios_gpint_command(struct amdgpu_device *adev,
 				 enum dmub_gpint_command command_code,
@@ -1762,16 +1782,20 @@ static struct dml2_soc_bb *dm_dmub_get_vbios_bounding_box(struct amdgpu_device *
 		/* Send the chunk */
 		ret = dm_dmub_send_vbios_gpint_command(adev, send_addrs[i], chunk, 30000);
 		if (ret != DMUB_STATUS_OK)
-			/* No need to free bb here since it shall be done in dm_sw_fini() */
-			return NULL;
+			goto free_bb;
 	}
 
 	/* Now ask DMUB to copy the bb */
 	ret = dm_dmub_send_vbios_gpint_command(adev, DMUB_GPINT__BB_COPY, 1, 200000);
 	if (ret != DMUB_STATUS_OK)
-		return NULL;
+		goto free_bb;
 
 	return bb;
+
+free_bb:
+	dm_free_gpu_mem(adev, DC_MEM_ALLOC_TYPE_GART, (void *) bb);
+	return NULL;
+
 }
 
 static enum dmub_ips_disable_type dm_get_default_ips_mode(
@@ -2541,11 +2565,11 @@ static int dm_sw_fini(struct amdgpu_ip_block *ip_block)
 			amdgpu_bo_free_kernel(&da->bo, &da->gpu_addr, &da->cpu_ptr);
 			list_del(&da->list);
 			kfree(da);
+			adev->dm.bb_from_dmub = NULL;
 			break;
 		}
 	}
 
-	adev->dm.bb_from_dmub = NULL;
 
 	kfree(adev->dm.dmub_fb_info);
 	adev->dm.dmub_fb_info = NULL;
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
index 15d4690c74d6..f5189b54a5cd 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
@@ -1004,6 +1004,9 @@ void *dm_allocate_gpu_mem(struct amdgpu_device *adev,
 						  enum dc_gpu_mem_alloc_type type,
 						  size_t size,
 						  long long *addr);
+void dm_free_gpu_mem(struct amdgpu_device *adev,
+						  enum dc_gpu_mem_alloc_type type,
+						  void *addr);
 
 bool amdgpu_dm_is_headless(struct amdgpu_device *adev);
 
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c
index 069e0195e50a..3f4b6f140374 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c
@@ -1054,17 +1054,8 @@ void dm_helpers_free_gpu_mem(
 		void *pvMem)
 {
 	struct amdgpu_device *adev = ctx->driver_context;
-	struct dal_allocation *da;
-
-	/* walk the da list in DM */
-	list_for_each_entry(da, &adev->dm.da_list, list) {
-		if (pvMem == da->cpu_ptr) {
-			amdgpu_bo_free_kernel(&da->bo, &da->gpu_addr, &da->cpu_ptr);
-			list_del(&da->list);
-			kfree(da);
-			break;
-		}
-	}
+
+	dm_free_gpu_mem(adev, type, pvMem);
 }
 
 bool dm_helpers_dmub_outbox_interrupt_control(struct dc_context *ctx, bool enable)
-- 
2.45.2



More information about the amd-gfx mailing list