[PATCH V2 1/2] drm/amdgpu: Fixed the defect of soft lock caused by infinite loop
yipechai
YiPeng.Chai at amd.com
Sun Jan 30 07:11:39 UTC 2022
1. The infinite loop case only occurs on multiple cards support
ras functions.
2. The explanation of root cause refer to commit 76641cbbf196
("drm/amdgpu: Add judgement to avoid infinite loop").
3. Create new node to manage each unique ras instance to guarantee
each device .ras_list is completely independent.
4. Fixes: commit 7a6b8ab3231b51 ("drm/amdgpu: Unify ras block
interface for each ras block").
5. The soft locked logs are as follows:
[ 262.165690] CPU: 93 PID: 758 Comm: kworker/93:1 Tainted: G OE 5.13.0-27-generic #29~20.04.1-Ubuntu
[ 262.165695] Hardware name: Supermicro AS -4124GS-TNR/H12DSG-O-CPU, BIOS T20200717143848 07/17/2020
[ 262.165698] Workqueue: events amdgpu_ras_do_recovery [amdgpu]
[ 262.165980] RIP: 0010:amdgpu_ras_get_ras_block+0x86/0xd0 [amdgpu]
[ 262.166239] Code: 68 d8 4c 8d 71 d8 48 39 c3 74 54 49 8b 45 38 48 85 c0 74 32 44 89 fa 44 89 e6 4c 89 ef e8 82 e4 9b dc 85 c0 74 3c 49 8b 46 28 <49> 8d 56 28 4d 89 f5 48 83 e8 28 48 39 d3 74 25 49 89 c6 49 8b 45
[ 262.166243] RSP: 0018:ffffac908fa87d80 EFLAGS: 00000202
[ 262.166247] RAX: ffffffffc1394248 RBX: ffff91e4ab8d6e20 RCX: ffffffffc1394248
[ 262.166249] RDX: ffff91e4aa356e20 RSI: 000000000000000e RDI: ffff91e4ab8c0000
[ 262.166252] RBP: ffffac908fa87da8 R08: 0000000000000007 R09: 0000000000000001
[ 262.166254] R10: ffff91e4930b64ec R11: 0000000000000000 R12: 000000000000000e
[ 262.166256] R13: ffff91e4aa356df8 R14: ffffffffc1394320 R15: 0000000000000003
[ 262.166258] FS: 0000000000000000(0000) GS:ffff92238fb40000(0000) knlGS:0000000000000000
[ 262.166261] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 262.166264] CR2: 00000001004865d0 CR3: 000000406d796000 CR4: 0000000000350ee0
[ 262.166267] Call Trace:
[ 262.166272] amdgpu_ras_do_recovery+0x130/0x290 [amdgpu]
[ 262.166529] ? psi_task_switch+0xd2/0x250
[ 262.166537] ? __switch_to+0x11d/0x460
[ 262.166542] ? __switch_to_asm+0x36/0x70
[ 262.166549] process_one_work+0x220/0x3c0
[ 262.166556] worker_thread+0x4d/0x3f0
[ 262.166560] ? process_one_work+0x3c0/0x3c0
[ 262.166563] kthread+0x12b/0x150
[ 262.166568] ? set_kthread_struct+0x40/0x40
[ 262.166571] ret_from_fork+0x22/0x30
Signed-off-by: yipechai <YiPeng.Chai at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 37 ++++++++++++++++++++++---
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 3 --
2 files changed, 33 insertions(+), 7 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 9d7c778c1a2d..9b94c9c4960c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -75,6 +75,13 @@ const char *ras_mca_block_string[] = {
"mca_iohc",
};
+struct amdgpu_ras_block_list {
+ /* ras block link */
+ struct list_head node;
+
+ struct amdgpu_ras_block_object *ras_obj;
+};
+
const char *get_ras_block_str(struct ras_common_if *ras_block)
{
if (!ras_block)
@@ -880,7 +887,8 @@ static struct amdgpu_ras_block_object *amdgpu_ras_get_ras_block(struct amdgpu_de
enum amdgpu_ras_block block, uint32_t sub_block_index)
{
int loop_cnt = 0;
- struct amdgpu_ras_block_object *obj, *tmp;
+ struct amdgpu_ras_block_list *node, *tmp;
+ struct amdgpu_ras_block_object *obj;
if (block >= AMDGPU_RAS_BLOCK__LAST)
return NULL;
@@ -888,7 +896,13 @@ static struct amdgpu_ras_block_object *amdgpu_ras_get_ras_block(struct amdgpu_de
if (!amdgpu_ras_is_supported(adev, block))
return NULL;
- list_for_each_entry_safe(obj, tmp, &adev->ras_list, node) {
+ list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
+ if (!node->ras_obj) {
+ dev_warn(adev->dev, "Warning: abnormal ras list node.\n");
+ continue;
+ }
+
+ obj = node->ras_obj;
if (obj->ras_block_match) {
if (obj->ras_block_match(obj, block, sub_block_index) == 0)
return obj;
@@ -2527,6 +2541,7 @@ int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
int amdgpu_ras_fini(struct amdgpu_device *adev)
{
+ struct amdgpu_ras_block_list *ras_node, *tmp;
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
if (!adev->ras_enabled || !con)
@@ -2545,6 +2560,12 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
amdgpu_ras_set_context(adev, NULL);
kfree(con);
+ /* Clear ras blocks from ras_list and free ras block list node */
+ list_for_each_entry_safe(ras_node, tmp, &adev->ras_list, node) {
+ list_del(&ras_node->node);
+ kfree(ras_node);
+ }
+
return 0;
}
@@ -2754,14 +2775,22 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
struct amdgpu_ras_block_object *ras_block_obj)
{
+ struct amdgpu_ras_block_list *ras_node;
if (!adev || !ras_block_obj)
return -EINVAL;
if (!amdgpu_ras_asic_supported(adev))
return 0;
- INIT_LIST_HEAD(&ras_block_obj->node);
- list_add_tail(&ras_block_obj->node, &adev->ras_list);
+ ras_node = kzalloc(sizeof(*ras_node), GFP_KERNEL);
+ if (!ras_node) {
+ dev_err(adev->dev, "Failed to allocate ras_node!\n");
+ return -ENOMEM;
+ }
+
+ INIT_LIST_HEAD(&ras_node->node);
+ ras_node->ras_obj = ras_block_obj;
+ list_add_tail(&ras_node->node, &adev->ras_list);
return 0;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index a51a281bd91a..a55743b12d57 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -493,9 +493,6 @@ struct amdgpu_ras_block_object {
uint32_t sub_block_index;
- /* ras block link */
- struct list_head node;
-
int (*ras_block_match)(struct amdgpu_ras_block_object *block_obj,
enum amdgpu_ras_block block, uint32_t sub_block_index);
int (*ras_late_init)(struct amdgpu_device *adev, void *ras_info);
--
2.25.1
More information about the amd-gfx
mailing list