[PATCH 3/3] drm/amdgpu: make reset method configurable for RAS poison
Zhang, Hawking
Hawking.Zhang at amd.com
Mon Mar 18 08:00:16 UTC 2024
[AMD Official Use Only - General]
Series is
Reviewed-by: Hawking Zhang <Hawking.Zhang at amd.com>
Regards,
Hawking
-----Original Message-----
From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of Tao Zhou
Sent: Monday, March 18, 2024 15:26
To: amd-gfx at lists.freedesktop.org
Cc: Zhou1, Tao <Tao.Zhou1 at amd.com>
Subject: [PATCH 3/3] drm/amdgpu: make reset method configurable for RAS poison
Each RAS block has different requirement for gpu reset in poison consumption handling.
Add support for mmhub RAS poison consumption handling.
v2: remove the mmhub poison support for kfd int v10.
Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 2 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 2 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 4 ++--
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 14 ++++++-------
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h | 4 ++--
.../gpu/drm/amd/amdkfd/kfd_int_process_v10.c | 13 +++++++----- .../gpu/drm/amd/amdkfd/kfd_int_process_v11.c | 9 +++++----
.../gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 20 ++++++++++++++-----
8 files changed, 40 insertions(+), 28 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 0b4910108f61..66753940bb4d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -760,7 +760,7 @@ bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev) }
void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
- enum amdgpu_ras_block block, bool reset)
+ enum amdgpu_ras_block block, uint32_t reset)
{
amdgpu_umc_poison_handler(adev, block, reset); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 03bf20e0e3da..ad50c7bbc326 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -400,7 +400,7 @@ void amdgpu_amdkfd_debug_mem_fence(struct amdgpu_device *adev); int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev,
struct tile_config *config);
void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
- enum amdgpu_ras_block block, bool reset);
+ enum amdgpu_ras_block block, uint32_t reset);
bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev); bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem *mem); void amdgpu_amdkfd_block_mmu_notifications(void *p); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index e32a186c2de1..58fe7bebdf1b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2045,7 +2045,7 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
}
}
- amdgpu_umc_poison_handler(adev, obj->head.block, false);
+ amdgpu_umc_poison_handler(adev, obj->head.block, 0);
if (block_obj->hw_ops && block_obj->hw_ops->handle_poison_consumption)
poison_stat = block_obj->hw_ops->handle_poison_consumption(adev);
@@ -2698,7 +2698,7 @@ static int amdgpu_ras_page_retirement_thread(void *param)
atomic_dec(&con->page_retirement_req_cnt);
amdgpu_umc_bad_page_polling_timeout(adev,
- false, MAX_UMC_POISON_POLLING_TIME_ASYNC);
+ 0, MAX_UMC_POISON_POLLING_TIME_ASYNC);
}
return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 20436f81856a..2c02585dcbff 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -186,9 +186,7 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
amdgpu_umc_handle_bad_pages(adev, ras_error_status);
if (err_data->ue_count && reset) {
- /* use mode-2 reset for poison consumption */
- if (!entry)
- con->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+ con->gpu_reset_flags |= reset;
amdgpu_ras_reset_gpu(adev);
}
@@ -196,7 +194,7 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, }
int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
- bool reset, uint32_t timeout_ms)
+ uint32_t reset, uint32_t timeout_ms)
{
struct ras_err_data err_data;
struct ras_common_if head = {
@@ -238,8 +236,7 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
if (reset) {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
- /* use mode-2 reset for poison consumption */
- con->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+ con->gpu_reset_flags |= reset;
amdgpu_ras_reset_gpu(adev);
}
@@ -247,7 +244,7 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev, }
int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
- enum amdgpu_ras_block block, bool reset)
+ enum amdgpu_ras_block block, uint32_t reset)
{
int ret = AMDGPU_RAS_SUCCESS;
@@ -311,7 +308,8 @@ int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
void *ras_error_status,
struct amdgpu_iv_entry *entry)
{
- return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry, true);
+ return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry,
+ AMDGPU_RAS_GPU_RESET_MODE1_RESET);
}
int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
index 26d2ae498daf..4365a20eeb49 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
@@ -103,7 +103,7 @@ struct amdgpu_umc {
int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev); int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block); int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
- enum amdgpu_ras_block block, bool reset);
+ enum amdgpu_ras_block block, uint32_t reset);
int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
struct amdgpu_irq_src *source,
struct amdgpu_iv_entry *entry);
@@ -123,5 +123,5 @@ int amdgpu_umc_loop_channels(struct amdgpu_device *adev,
umc_func func, void *data);
int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
- bool reset, uint32_t timeout_ms);
+ uint32_t reset, uint32_t timeout_ms);
#endif
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
index 650da18b0d87..740f89aafbc0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
@@ -134,6 +134,7 @@ static void event_interrupt_poison_consumption(struct kfd_node *dev, {
enum amdgpu_ras_block block = 0;
int old_poison, ret = -EINVAL;
+ uint32_t reset = 0;
struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
if (!p)
@@ -153,6 +154,8 @@ static void event_interrupt_poison_consumption(struct kfd_node *dev,
case SOC15_IH_CLIENTID_UTCL2:
ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
block = AMDGPU_RAS_BLOCK__GFX;
+ if (ret)
+ reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
break;
case SOC15_IH_CLIENTID_SDMA0:
case SOC15_IH_CLIENTID_SDMA1:
@@ -160,6 +163,7 @@ static void event_interrupt_poison_consumption(struct kfd_node *dev,
case SOC15_IH_CLIENTID_SDMA3:
case SOC15_IH_CLIENTID_SDMA4:
block = AMDGPU_RAS_BLOCK__SDMA;
+ reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
break;
default:
break;
@@ -170,17 +174,16 @@ static void event_interrupt_poison_consumption(struct kfd_node *dev,
/* resetting queue passes, do page retirement without gpu reset
* resetting queue fails, fallback to gpu reset solution
*/
- if (!ret) {
+ if (!ret)
dev_warn(dev->adev->dev,
"RAS poison consumption, unmap queue flow succeeded: client id %d\n",
client_id);
- amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, false);
- } else {
+ else
dev_warn(dev->adev->dev,
"RAS poison consumption, fall back to gpu reset flow: client id %d\n",
client_id);
- amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, true);
- }
+
+ amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, reset);
}
static bool event_interrupt_isr_v10(struct kfd_node *dev, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
index 7e2859736a55..d3d6b5c180b3 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
@@ -193,6 +193,7 @@ static void event_interrupt_poison_consumption_v11(struct kfd_node *dev, {
enum amdgpu_ras_block block = 0;
int ret = -EINVAL;
+ uint32_t reset = 0;
struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
if (!p)
@@ -212,10 +213,13 @@ static void event_interrupt_poison_consumption_v11(struct kfd_node *dev,
if (dev->dqm->ops.reset_queues)
ret = dev->dqm->ops.reset_queues(dev->dqm, pasid);
block = AMDGPU_RAS_BLOCK__GFX;
+ if (ret)
+ reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
break;
case SOC21_INTSRC_SDMA_ECC:
default:
block = AMDGPU_RAS_BLOCK__GFX;
+ reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
break;
}
@@ -223,10 +227,7 @@ static void event_interrupt_poison_consumption_v11(struct kfd_node *dev,
/* resetting queue passes, do page retirement without gpu reset
resetting queue fails, fallback to gpu reset solution */
- if (!ret)
- amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, false);
- else
- amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, true);
+ amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, reset);
}
static bool event_interrupt_isr_v11(struct kfd_node *dev, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index 11641f4645e6..2a37ab7a7150 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -145,6 +145,7 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev, {
enum amdgpu_ras_block block = 0;
int old_poison, ret = -EINVAL;
+ uint32_t reset = 0;
struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
if (!p)
@@ -164,6 +165,15 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
case SOC15_IH_CLIENTID_UTCL2:
ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
block = AMDGPU_RAS_BLOCK__GFX;
+ if (ret)
+ reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+ break;
+ case SOC15_IH_CLIENTID_VMC:
+ case SOC15_IH_CLIENTID_VMC1:
+ ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
+ block = AMDGPU_RAS_BLOCK__MMHUB;
+ if (ret)
+ reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
break;
case SOC15_IH_CLIENTID_SDMA0:
case SOC15_IH_CLIENTID_SDMA1:
@@ -171,6 +181,7 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
case SOC15_IH_CLIENTID_SDMA3:
case SOC15_IH_CLIENTID_SDMA4:
block = AMDGPU_RAS_BLOCK__SDMA;
+ reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
break;
default:
break;
@@ -181,17 +192,16 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
/* resetting queue passes, do page retirement without gpu reset
* resetting queue fails, fallback to gpu reset solution
*/
- if (!ret) {
+ if (!ret)
dev_warn(dev->adev->dev,
"RAS poison consumption, unmap queue flow succeeded: client id %d\n",
client_id);
- amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, false);
- } else {
+ else
dev_warn(dev->adev->dev,
"RAS poison consumption, fall back to gpu reset flow: client id %d\n",
client_id);
- amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, true);
- }
+
+ amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, reset);
}
static bool context_id_expected(struct kfd_dev *dev)
--
2.34.1
More information about the amd-gfx
mailing list