[PATCH V2 2/4] drm/amdgpu: refine poison creation interrupt handler
Zhang, Hawking
Hawking.Zhang at amd.com
Fri Jun 21 06:41:28 UTC 2024
[AMD Official Use Only - AMD Internal Distribution Only]
Reviewed-by: Hawking Zhang <Hawking.Zhang at amd.com>
Regards,
Hawking
-----Original Message-----
From: Chai, Thomas <YiPeng.Chai at amd.com>
Sent: Thursday, June 20, 2024 13:40
To: amd-gfx at lists.freedesktop.org
Cc: Zhang, Hawking <Hawking.Zhang at amd.com>; Zhou1, Tao <Tao.Zhou1 at amd.com>; Li, Candice <Candice.Li at amd.com>; Wang, Yang(Kevin) <KevinYang.Wang at amd.com>; Yang, Stanley <Stanley.Yang at amd.com>; Chai, Thomas <YiPeng.Chai at amd.com>
Subject: [PATCH V2 2/4] drm/amdgpu: refine poison creation interrupt handler
In order to apply to the case where a large number of ras poison interrupts:
1. Change to use variable to record poison creation
requests to avoid fifo full.
2. Prioritize handling poison creation requests
instead of following the order of requests
received by the driver.
Signed-off-by: YiPeng Chai <YiPeng.Chai at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 37 ++++++++++++------------- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 1 +
2 files changed, 18 insertions(+), 20 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index f674e34037b6..308348b4644f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2106,10 +2106,8 @@ static void amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj
if (amdgpu_ip_version(obj->adev, UMC_HWIP, 0) >= IP_VERSION(12, 0, 0)) {
struct amdgpu_ras *con = amdgpu_ras_get_context(obj->adev);
- amdgpu_ras_put_poison_req(obj->adev,
- AMDGPU_RAS_BLOCK__UMC, 0, NULL, NULL, false);
-
atomic_inc(&con->page_retirement_req_cnt);
+ atomic_inc(&con->poison_creation_count);
wake_up(&con->page_retirement_wq);
}
@@ -2945,6 +2943,8 @@ static int amdgpu_ras_page_retirement_thread(void *param) {
struct amdgpu_device *adev = (struct amdgpu_device *)param;
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ uint32_t poison_creation_count;
+ int ret;
struct ras_poison_msg poison_msg;
enum amdgpu_ras_block ras_block;
bool poison_creation_is_handled = false; @@ -2958,7 +2958,18 @@ static int amdgpu_ras_page_retirement_thread(void *param)
if (kthread_should_stop())
break;
- atomic_dec(&con->page_retirement_req_cnt);
+
+ do {
+ poison_creation_count = atomic_read(&con->poison_creation_count);
+ ret = amdgpu_ras_poison_creation_handler(adev, poison_creation_count);
+ if (ret == -EIO)
+ break;
+
+ if (poison_creation_count) {
+ atomic_sub(poison_creation_count, &con->poison_creation_count);
+ atomic_sub(poison_creation_count, &con->page_retirement_req_cnt);
+ }
+ } while (atomic_read(&con->poison_creation_count));
#ifdef HAVE_KFIFO_PUT_NON_POINTER
if (!amdgpu_ras_get_poison_req(adev, &poison_msg)) @@ -2969,24 +2980,8 @@ static int amdgpu_ras_page_retirement_thread(void *param)
dev_dbg(adev->dev, "Start processing ras block %s(%d)\n",
ras_block_str(ras_block), ras_block);
- if (ras_block == AMDGPU_RAS_BLOCK__UMC) {
- amdgpu_ras_poison_creation_handler(adev,
- MAX_UMC_POISON_POLLING_TIME_ASYNC);
- poison_creation_is_handled = true;
- } else {
- /* poison_creation_is_handled:
- * false: no poison creation interrupt, but it has poison
- * consumption interrupt.
- * true: It has poison creation interrupt at the beginning,
- * but it has no poison creation interrupt later.
- */
- amdgpu_ras_poison_creation_handler(adev,
- poison_creation_is_handled ?
- 0 : MAX_UMC_POISON_POLLING_TIME_ASYNC);
amdgpu_ras_poison_consumption_handler(adev, &poison_msg);
- poison_creation_is_handled = false;
- }
#else
dev_info(adev->dev, "Start processing page retirement. request:%d\n",
atomic_read(&con->page_retirement_req_cnt));
@@ -3066,6 +3061,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
mutex_init(&con->page_retirement_lock);
init_waitqueue_head(&con->page_retirement_wq);
atomic_set(&con->page_retirement_req_cnt, 0);
+ atomic_set(&con->poison_creation_count, 0);
con->page_retirement_thread =
kthread_run(amdgpu_ras_page_retirement_thread, adev, "umc_page_retirement");
if (IS_ERR(con->page_retirement_thread)) { @@ -3117,6 +3113,7 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
kthread_stop(con->page_retirement_thread);
atomic_set(&con->page_retirement_req_cnt, 0);
+ atomic_set(&con->poison_creation_count, 0);
mutex_destroy(&con->page_rsv_lock);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 748bbac666e6..0fa1148e6642 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -532,6 +532,7 @@ struct amdgpu_ras {
wait_queue_head_t page_retirement_wq;
struct mutex page_retirement_lock;
atomic_t page_retirement_req_cnt;
+ atomic_t poison_creation_count;
struct mutex page_rsv_lock;
DECLARE_KFIFO(poison_fifo, struct ras_poison_msg, 128);
struct ras_ecc_log_info umc_ecc_log;
--
2.34.1
More information about the amd-gfx
mailing list