[PATCH] drm/amdgpu: Fix the kernel panic caused by RAS records exceed threshold

Zhou1, Tao Tao.Zhou1 at amd.com
Mon May 12 05:51:29 UTC 2025


[AMD Official Use Only - AMD Internal Distribution Only]

> -----Original Message-----
> From: Sun, Ce(Overlord) <Ce.Sun at amd.com>
> Sent: Monday, May 12, 2025 1:14 PM
> To: amd-gfx at lists.freedesktop.org
> Cc: Zhou1, Tao <Tao.Zhou1 at amd.com>; Zhang, Hawking
> <Hawking.Zhang at amd.com>; Sun, Ce(Overlord) <Ce.Sun at amd.com>
> Subject: [PATCH] drm/amdgpu: Fix the kernel panic caused by RAS records exceed
> threshold
>
> kernel panic caused by RAS records exceeding the threshold when load driver
> specifying RMA(bad_page_threshold=128) 1.Fix the warnings caused by disabling
> the interrupt source before it was enabled 2.xcp sysfs is not initialized,null pointer
> appears during fini 3.Fix the memory leak caused by the device's early exit due to
> rma
>
> The first reason:
> [ 2744.246650] ------------[ cut here ]------------ [ 2744.246651] WARNING: CPU: 0 PID:
> 289 at /tmp/amd.BkfTLqYV/amd/amdgpu/amdgpu_irq.c:635
> amdgpu_irq_put.cold+0x42/0x6e [amdgpu] [ 2744.247108] Modules linked in:
> amdgpu(OE+) amddrm_ttm_helper(OE) amdttm(OE) amdxcp(OE)
> amddrm_buddy(OE) amddrm_exec(OE) amd_sched(OE) amdkcl(OE) xt_conntrack
> nft_chain_nat xt_MASQUERADE nf_nat nf_conntrack_netlink nf_conntrack
> nf_defrag_ipv6 nf_defrag_ipv4 xfrm_user xfrm_algo nft_counter xt_addrtype
> nft_compat nf_tables nfnetlink br_netfilter bridge stp llc overlay binfmt_misc
> intel_rapl_msr intel_rapl_common i10nm_edac nfit x86_pkg_temp_thermal
> intel_powerclamp coretemp ipmi_ssif kvm_intel nls_iso8859_1 kvm rapl
> isst_if_mbox_pci isst_if_mmio pmt_telemetry pmt_crashlog isst_if_common
> pmt_class mei_me mei acpi_ipmi ipmi_si ipmi_devintf ipmi_msghandler
> acpi_power_meter acpi_pad mac_hid sch_fq_codel dm_multipath scsi_dh_rdac
> scsi_dh_emc scsi_dh_alua msr ramoops reed_solomon pstore_blk pstore_zone
> efi_pstore ip_tables x_tables autofs4 btrfs blake2b_generic zstd_compress raid10
> raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor
> raid6_pq libcrc32c raid1 raid0 multipath [ 2744.247167]  linear mlx5_ib ib_uverbs
> ib_core ast i2c_algo_bit drm_vram_helper drm_ttm_helper ttm drm_kms_helper
> crct10dif_pclmul syscopyarea crc32_pclmul ghash_clmulni_intel mlx5_core sysfillrect
> sysimgblt aesni_intel mlxfw fb_sys_fops psample cec crypto_simd cryptd rc_core
> i2c_i801 nvme xhci_pci tls intel_pmt drm pci_hyperv_intf nvme_core i2c_smbus
> i2c_ismt xhci_pci_renesas wmi pinctrl_emmitsburg
> [ 2744.247194] CPU: 0 PID: 289 Comm: kworker/0:1 Tainted: G           OE     5.15.0-
> 70-generic #77-Ubuntu
> [ 2744.247197] Hardware name: Microsoft C278A/C278A, BIOS
> C2789.5.BS.1C23.AG.2 11/21/2024 [ 2744.247198] Workqueue: events
> work_for_cpu_fn [ 2744.247206] RIP: 0010:amdgpu_irq_put.cold+0x42/0x6e
> [amdgpu] [ 2744.247634] Code: 79 7f ff 44 89 ee 48 c7 c7 4d 5a 42 c2 89 55 d4 e8
> 90 09 bc bf 8b 55 d4 4c 89 e6 4c 89 ff e8 3c 76 7f ff 8b 55 d4 84 c0 75 07 <0f> 0b e9
> 95 79 7f ff 49 03 5c 24 08 f0 ff 0b 75 13 4c 89 e6 4c 89 [ 2744.247636] RSP:
> 0018:ffa0000019e27cb0 EFLAGS: 00010246 [ 2744.247639] RAX:
> 0000000000000000 RBX: 0000000000000000 RCX: ff11000150fa87c0
> [ 2744.247641] RDX: 0000000000000000 RSI: ffffffffc2222430 RDI:
> ff1100019f200000 [ 2744.247642] RBP: ffa0000019e27ce0 R08: 0000000000000003
> R09: ffffffffffe41a08 [ 2744.247643] R10: 0000000000ffff0a R11: 0000000000000001
> R12: ff1100019f22ce60 [ 2744.247644] R13: 0000000000000000 R14:
> 00000000ffffffea R15: ff1100019f200000 [ 2744.247645] FS:
> 0000000000000000(0000) GS:ff11007e7e400000(0000) knlGS:0000000000000000
> [ 2744.247647] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [ 2744.247649] CR2: 00007f3d2002819c CR3: 0000000006810003 CR4:
> 0000000000771ef0 [ 2744.247650] DR0: 0000000000000000 DR1:
> 0000000000000000 DR2: 0000000000000000 [ 2744.247651] DR3:
> 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400
> [ 2744.247652] PKRU: 55555554 [ 2744.247653] Call Trace:
> [ 2744.247654]  <TASK>
> [ 2744.247656]  sdma_v4_4_2_hw_fini+0x7a/0xc0 [amdgpu] [ 2744.247997]  ?
> vcn_v4_0_3_hw_fini+0x5f/0xa0 [amdgpu] [ 2744.248336]
> amdgpu_ip_block_hw_fini+0x31/0x61 [amdgpu] [ 2744.248776]
> amdgpu_device_fini_hw+0x3bb/0x47b [amdgpu] [ 2744.249197]  ?
> blocking_notifier_chain_unregister+0x56/0xb0
> [ 2744.249202]  amdgpu_driver_unload_kms+0x51/0x60 [amdgpu] [ 2744.249482]
> amdgpu_driver_load_kms.cold+0x18/0x2e [amdgpu] [ 2744.249913]
> amdgpu_pci_probe+0x23e/0x590 [amdgpu] [ 2744.250187]
> local_pci_probe+0x48/0x90 [ 2744.250191]  work_for_cpu_fn+0x17/0x30
> [ 2744.250196]  process_one_work+0x228/0x3d0 [ 2744.250198]
> worker_thread+0x223/0x420 [ 2744.250200]  ? process_one_work+0x3d0/0x3d0
> [ 2744.250201]  kthread+0x127/0x150 [ 2744.250204]  ?
> set_kthread_struct+0x50/0x50 [ 2744.250207]  ret_from_fork+0x1f/0x30
> [ 2744.250212]  </TASK> [ 2744.250213] ---[ end trace 488c997a88508bc3 ]---
>
> The second reason:
> [ 5139.303446] Memory manager not clean during takedown.
> [ 5139.303509] WARNING: CPU: 145 PID: 117699 at drivers/gpu/drm/drm_mm.c:998
> drm_mm_takedown+0x27/0x30 [drm] [ 5139.303542] Modules linked in:
> amdgpu(OE+) amddrm_ttm_helper(OE) amdttm(OE) amdxcp(OE)
> amddrm_buddy(OE) amddrm_exec(OE) amd_sched(OE) amdkcl(OE) xt_conntrack
> nft_chain_nat xt_MASQUERADE nf_nat nf_conntrack_netlink nf_conntrack
> nf_defrag_ipv6 nf_defrag_ipv4 xfrm_user xfrm_algo nft_counter xt_addrtype
> nft_compat nf_tables nfnetlink br_netfilter bridge stp llc overlay intel_rapl_msr
> intel_rapl_common i10nm_edac nfit x86_pkg_temp_thermal intel_powerclamp
> coretemp ipmi_ssif kvm_intel binfmt_misc kvm nls_iso8859_1 rapl isst_if_mbox_pci
> pmt_telemetry pmt_crashlog isst_if_mmio pmt_class isst_if_common mei_me mei
> acpi_ipmi ipmi_si ipmi_devintf ipmi_msghandler acpi_pad acpi_power_meter
> mac_hid sch_fq_codel dm_multipath scsi_dh_rdac scsi_dh_emc scsi_dh_alua msr
> ramoops reed_solomon pstore_blk pstore_zone efi_pstore ip_tables x_tables autofs4
> btrfs blake2b_generic zstd_compress raid10 raid456 async_raid6_recov
> async_memcpy async_pq async_xor async_tx xor raid6_pq libcrc32c raid1 raid0
> multipath [ 5139.303572]  linear mlx5_ib ib_uverbs ib_core crct10dif_pclmul ast
> crc32_pclmul i2c_algo_bit ghash_clmulni_intel aesni_intel crypto_simd
> drm_vram_helper cryptd drm_ttm_helper mlx5_core ttm drm_kms_helper
> syscopyarea sysfillrect sysimgblt fb_sys_fops cec rc_core mlxfw psample intel_pmt
> nvme xhci_pci drm tls i2c_i801 pci_hyperv_intf nvme_core i2c_smbus i2c_ismt
> xhci_pci_renesas wmi pinctrl_emmitsburg [last unloaded: amdkcl]
> [ 5139.303588] CPU: 145 PID: 117699 Comm: modprobe Tainted: G     U     OE
> 5.15.0-70-generic #77-Ubuntu
> [ 5139.303590] Hardware name: Microsoft C278A/C278A, BIOS
> C2789.5.BS.1C23.AG.2 11/21/2024 [ 5139.303591] RIP:
> 0010:drm_mm_takedown+0x27/0x30 [drm] [ 5139.303605] Code: cc 66 90 0f 1f 44
> 00 00 48 8b 47 38 48 83 c7 38 48 39 f8 75 05 c3 cc cc cc cc 55 48 c7 c7 18 d0 10 c0
> 48 89 e5 e8 5a bc c3 c1 <0f> 0b 5d c3 cc cc cc cc 90 0f 1f 44 00 00 55 b9 15 00 00
> 00 48 89 [ 5139.303607] RSP: 0018:ffa00000325c3940 EFLAGS: 00010286
> [ 5139.303608] RAX: 0000000000000000 RBX: ff1100012f5cecb0 RCX:
> 0000000000000027 [ 5139.303609] RDX: ff11007e7fa60588 RSI:
> 0000000000000001 RDI: ff11007e7fa60580 [ 5139.303610] RBP: ffa00000325c3940
> R08: 0000000000000003 R09: fffffffff00c2b78 [ 5139.303610] R10:
> 000000000000002b R11: 0000000000000001 R12: ff1100012f5cec00
> [ 5139.303611] R13: ff1100012138f068 R14: 0000000000000000 R15:
> ff1100012f5cec90 [ 5139.303611] FS:  00007f42ffca0000(0000)
> GS:ff11007e7fa40000(0000) knlGS:0000000000000000 [ 5139.303612] CS:  0010
> DS: 0000 ES: 0000 CR0: 0000000080050033 [ 5139.303613] CR2:
> 00007f23d945ab68 CR3: 00000001212ce005 CR4: 0000000000771ee0
> [ 5139.303614] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
> 0000000000000000 [ 5139.303615] DR3: 0000000000000000 DR6:
> 00000000fffe07f0 DR7: 0000000000000400 [ 5139.303615] PKRU: 55555554
> [ 5139.303616] Call Trace:
> [ 5139.303617]  <TASK>
> [ 5139.303619]  amdttm_range_man_fini_nocheck+0xfe/0x1c0 [amdttm]
> [ 5139.303625]  amdgpu_ttm_fini+0x2ed/0x390 [amdgpu] [ 5139.303800]
> amdgpu_bo_fini+0x27/0xc0 [amdgpu] [ 5139.303959]  gmc_v9_0_sw_fini+0x63/0x90
> [amdgpu] [ 5139.304144]  amdgpu_device_fini_sw+0x125/0x6a0 [amdgpu]
> [ 5139.304302]  amdgpu_driver_release_kms+0x16/0x30 [amdgpu] [ 5139.304455]
> devm_drm_dev_init_release+0x4a/0x80 [drm] [ 5139.304472]
> devm_action_release+0x12/0x20 [ 5139.304476]  release_nodes+0x3d/0xb0
> [ 5139.304478]  devres_release_all+0x9b/0xd0 [ 5139.304480]
> really_probe+0x11d/0x420 [ 5139.304483]  __driver_probe_device+0x119/0x190
> [ 5139.304485]  driver_probe_device+0x23/0xc0 [ 5139.304487]
> __driver_attach+0xf7/0x1f0 [ 5139.304489]  ? __device_attach_driver+0x140/0x140
> [ 5139.304491]  bus_for_each_dev+0x7c/0xd0 [ 5139.304493]
> driver_attach+0x1e/0x30 [ 5139.304494]  bus_add_driver+0x148/0x220
> [ 5139.304496]  driver_register+0x95/0x100 [ 5139.304498]
> __pci_register_driver+0x68/0x70 [ 5139.304500]  amdgpu_init+0xbc/0x1000
> [amdgpu] [ 5139.304655]  ? 0xffffffffc0b8f000 [ 5139.304657]
> do_one_initcall+0x46/0x1e0 [ 5139.304659]  ?
> kmem_cache_alloc_trace+0x19e/0x2e0
> [ 5139.304663]  do_init_module+0x52/0x260 [ 5139.304665]
> load_module+0xb2b/0xbc0 [ 5139.304667]  __do_sys_finit_module+0xbf/0x120
> [ 5139.304669]  __x64_sys_finit_module+0x18/0x20 [ 5139.304670]
> do_syscall_64+0x59/0xc0 [ 5139.304673]  ? exit_to_user_mode_prepare+0x37/0xb0
> [ 5139.304676]  ? syscall_exit_to_user_mode+0x27/0x50
> [ 5139.304678]  ? __x64_sys_mmap+0x33/0x50 [ 5139.304680]  ?
> do_syscall_64+0x69/0xc0 [ 5139.304681]
> entry_SYSCALL_64_after_hwframe+0x61/0xcb
> [ 5139.304684] RIP: 0033:0x7f42ffdbf88d
> [ 5139.304686] Code: 5b 41 5c c3 66 0f 1f 84 00 00 00 00 00 f3 0f 1e fa 48 89 f8 48
> 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73
> 01 c3 48 8b 0d 73 b5 0f 00 f7 d8 64 89 01 48 [ 5139.304687] RSP:
> 002b:00007ffcb7427158 EFLAGS: 00000246 ORIG_RAX: 0000000000000139
> [ 5139.304688] RAX: ffffffffffffffda RBX: 000055ce8b8f3150 RCX: 00007f42ffdbf88d
> [ 5139.304689] RDX: 0000000000000000 RSI: 000055ce8b8f9a70 RDI:
> 000000000000000a [ 5139.304690] RBP: 0000000000040000 R08:
> 0000000000000000 R09: 0000000000000011 [ 5139.304690] R10:
> 000000000000000a R11: 0000000000000246 R12: 000055ce8b8f9a70
> [ 5139.304691] R13: 000055ce8b8f2ec0 R14: 000055ce8b8f2ab0 R15:
> 000055ce8b8f9aa0 [ 5139.304692]  </TASK> [ 5139.304693] ---[ end trace
> 8536b052f7883003 ]---
>
> Signed-off-by: Ce Sun <cesun102 at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c | 5 +++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 2 ++
> drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c | 4 ++--
>  3 files changed, 9 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
> index 0e890f2785b1..ef9125a696ad 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
> @@ -621,6 +621,11 @@ int amdgpu_irq_get(struct amdgpu_device *adev, struct
> amdgpu_irq_src *src,  int amdgpu_irq_put(struct amdgpu_device *adev, struct
> amdgpu_irq_src *src,
>                  unsigned int type)
>  {
> +     struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> +
> +     if(ras->is_rma)
> +             return -EINVAL;

[Tao] please add  a comment for the check, with this added, the patch is:

Reviewed-by: Tao Zhou <tao.zhou1 at amd.com>

> +
>       if (!adev->irq.installed)
>               return -ENOENT;
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> index cb7991362b5d..6f24e95c9112 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> @@ -2496,6 +2496,8 @@ void amdgpu_ttm_fini(struct amdgpu_device *adev)
>       amdgpu_vram_mgr_fini(adev);
>       amdgpu_gtt_mgr_fini(adev);
>       amdgpu_preempt_mgr_fini(adev);
> +     amdgpu_doorbell_fini(adev);
> +
>       ttm_range_man_fini(&adev->mman.bdev, AMDGPU_PL_GDS);
>       ttm_range_man_fini(&adev->mman.bdev, AMDGPU_PL_GWS);
>       ttm_range_man_fini(&adev->mman.bdev, AMDGPU_PL_OA); diff --git
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
> index cd6e4c0b94da..e970a1427fe0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
> @@ -719,10 +719,10 @@ void amdgpu_xcp_cfg_sysfs_fini(struct amdgpu_device
> *adev)
>       struct amdgpu_xcp_cfg *xcp_cfg;
>       int i;
>
> -     if (!adev->xcp_mgr)
> +     if (!adev->xcp_mgr || !adev->xcp_mgr->xcp_cfg)
>               return;
>
> -     xcp_cfg =  adev->xcp_mgr->xcp_cfg;
> +     xcp_cfg = adev->xcp_mgr->xcp_cfg;
>       for (i = 0; i < xcp_cfg->num_res; i++) {
>               xcp_res = &xcp_cfg->xcp_res[i];
>               kobject_put(&xcp_res->kobj);
> --
> 2.34.1



More information about the amd-gfx mailing list