[PATCH 3/3] drm/amdgpu: make reset method configurable for RAS poison
Zhou1, Tao
Tao.Zhou1 at amd.com
Mon Mar 18 03:10:44 UTC 2024
[AMD Official Use Only - General]
I can remove the support for SOC15_IH_CLIENTID_VMC from v10, but the reset type should be changed from bool to uint32 for all versions.
Regards,
Tao
> -----Original Message-----
> From: Zhang, Hawking <Hawking.Zhang at amd.com>
> Sent: Sunday, March 17, 2024 6:10 PM
> To: Zhou1, Tao <Tao.Zhou1 at amd.com>; amd-gfx at lists.freedesktop.org
> Cc: Zhou1, Tao <Tao.Zhou1 at amd.com>
> Subject: RE: [PATCH 3/3] drm/amdgpu: make reset method configurable for RAS
> poison
>
> [AMD Official Use Only - General]
>
> Let's not copy kfd interrupt handler and the work queue implementation from v9
> to v10 since the firmware/hardware design are totally different.
>
> We shall have another patch to fix kfd int v10 for poison consumption handling
> and also v11.
>
> Regards,
> Hawking
>
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of Tao Zhou
> Sent: Wednesday, March 13, 2024 17:12
> To: amd-gfx at lists.freedesktop.org
> Cc: Zhou1, Tao <Tao.Zhou1 at amd.com>
> Subject: [PATCH 3/3] drm/amdgpu: make reset method configurable for RAS
> poison
>
> Each RAS block has different requirement for gpu reset in poison consumption
> handling.
> Add support for mmhub RAS poison consumption handling.
>
> Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 2 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 4 ++--
> drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 14 ++++++-------
> drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h | 4 ++--
> .../gpu/drm/amd/amdkfd/kfd_int_process_v10.c | 20 ++++++++++++++-----
> .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 20 ++++++++++++++-----
> 7 files changed, 42 insertions(+), 24 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index 9687650b0fe3..262d20167039 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -760,7 +760,7 @@ bool amdgpu_amdkfd_is_fed(struct amdgpu_device
> *adev) }
>
> void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device
> *adev,
> - enum amdgpu_ras_block block, bool reset)
> + enum amdgpu_ras_block block, uint32_t reset)
> {
> amdgpu_umc_poison_handler(adev, block, reset); } diff --git
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index 03bf20e0e3da..ad50c7bbc326 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -400,7 +400,7 @@ void amdgpu_amdkfd_debug_mem_fence(struct
> amdgpu_device *adev); int amdgpu_amdkfd_get_tile_config(struct
> amdgpu_device *adev,
> struct tile_config *config); void
> amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device
> *adev,
> - enum amdgpu_ras_block block, bool reset);
> + enum amdgpu_ras_block block, uint32_t reset);
> bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev); bool
> amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct
> kgd_mem *mem); void amdgpu_amdkfd_block_mmu_notifications(void *p); diff
> --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index e32a186c2de1..58fe7bebdf1b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -2045,7 +2045,7 @@ static void
> amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
> }
> }
>
> - amdgpu_umc_poison_handler(adev, obj->head.block, false);
> + amdgpu_umc_poison_handler(adev, obj->head.block, 0);
>
> if (block_obj->hw_ops && block_obj->hw_ops->handle_poison_consumption)
> poison_stat = block_obj->hw_ops->handle_poison_consumption(adev);
> @@ -2698,7 +2698,7 @@ static int amdgpu_ras_page_retirement_thread(void
> *param)
> atomic_dec(&con->page_retirement_req_cnt);
>
> amdgpu_umc_bad_page_polling_timeout(adev,
> - false, MAX_UMC_POISON_POLLING_TIME_ASYNC);
> + 0, MAX_UMC_POISON_POLLING_TIME_ASYNC);
> }
>
> return 0;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> index 20436f81856a..2c02585dcbff 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> @@ -186,9 +186,7 @@ static int amdgpu_umc_do_page_retirement(struct
> amdgpu_device *adev,
> amdgpu_umc_handle_bad_pages(adev, ras_error_status);
>
> if (err_data->ue_count && reset) {
> - /* use mode-2 reset for poison consumption */
> - if (!entry)
> - con->gpu_reset_flags |=
> AMDGPU_RAS_GPU_RESET_MODE2_RESET;
> + con->gpu_reset_flags |= reset;
> amdgpu_ras_reset_gpu(adev);
> }
>
> @@ -196,7 +194,7 @@ static int amdgpu_umc_do_page_retirement(struct
> amdgpu_device *adev, }
>
> int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
> - bool reset, uint32_t timeout_ms)
> + uint32_t reset, uint32_t timeout_ms)
> {
> struct ras_err_data err_data;
> struct ras_common_if head = {
> @@ -238,8 +236,7 @@ int amdgpu_umc_bad_page_polling_timeout(struct
> amdgpu_device *adev,
> if (reset) {
> struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>
> - /* use mode-2 reset for poison consumption */
> - con->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET;
> + con->gpu_reset_flags |= reset;
> amdgpu_ras_reset_gpu(adev);
> }
>
> @@ -247,7 +244,7 @@ int amdgpu_umc_bad_page_polling_timeout(struct
> amdgpu_device *adev, }
>
> int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
> - enum amdgpu_ras_block block, bool reset)
> + enum amdgpu_ras_block block, uint32_t reset)
> {
> int ret = AMDGPU_RAS_SUCCESS;
>
> @@ -311,7 +308,8 @@ int amdgpu_umc_process_ras_data_cb(struct
> amdgpu_device *adev,
> void *ras_error_status,
> struct amdgpu_iv_entry *entry) {
> - return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry,
> true);
> + return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry,
> + AMDGPU_RAS_GPU_RESET_MODE1_RESET);
> }
>
> int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev) diff --git
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> index 26d2ae498daf..4365a20eeb49 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> @@ -103,7 +103,7 @@ struct amdgpu_umc {
> int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev); int
> amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if
> *ras_block); int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
> - enum amdgpu_ras_block block, bool reset);
> + enum amdgpu_ras_block block, uint32_t reset);
> int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
> struct amdgpu_irq_src *source,
> struct amdgpu_iv_entry *entry); @@ -123,5 +123,5 @@ int
> amdgpu_umc_loop_channels(struct amdgpu_device *adev,
> umc_func func, void *data);
>
> int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
> - bool reset, uint32_t timeout_ms);
> + uint32_t reset, uint32_t timeout_ms);
> #endif
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
> index 650da18b0d87..94ab1f33fc4a 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
> @@ -134,6 +134,7 @@ static void event_interrupt_poison_consumption(struct
> kfd_node *dev, {
> enum amdgpu_ras_block block = 0;
> int old_poison, ret = -EINVAL;
> + uint32_t reset = 0;
> struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
>
> if (!p)
> @@ -153,6 +154,15 @@ static void event_interrupt_poison_consumption(struct
> kfd_node *dev,
> case SOC15_IH_CLIENTID_UTCL2:
> ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
> block = AMDGPU_RAS_BLOCK__GFX;
> + if (ret)
> + reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
> + break;
> + case SOC15_IH_CLIENTID_VMC:
> + case SOC15_IH_CLIENTID_VMC1:
> + ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
> + block = AMDGPU_RAS_BLOCK__MMHUB;
> + if (ret)
> + reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
> break;
> case SOC15_IH_CLIENTID_SDMA0:
> case SOC15_IH_CLIENTID_SDMA1:
> @@ -160,6 +170,7 @@ static void event_interrupt_poison_consumption(struct
> kfd_node *dev,
> case SOC15_IH_CLIENTID_SDMA3:
> case SOC15_IH_CLIENTID_SDMA4:
> block = AMDGPU_RAS_BLOCK__SDMA;
> + reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
> break;
> default:
> break;
> @@ -170,17 +181,16 @@ static void event_interrupt_poison_consumption(struct
> kfd_node *dev,
> /* resetting queue passes, do page retirement without gpu reset
> * resetting queue fails, fallback to gpu reset solution
> */
> - if (!ret) {
> + if (!ret)
> dev_warn(dev->adev->dev,
> "RAS poison consumption, unmap queue flow succeeded: client
> id %d\n",
> client_id);
> - amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block,
> false);
> - } else {
> + else
> dev_warn(dev->adev->dev,
> "RAS poison consumption, fall back to gpu reset flow: client
> id %d\n",
> client_id);
> - amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block,
> true);
> - }
> +
> + amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block,
> + reset);
> }
>
> static bool event_interrupt_isr_v10(struct kfd_node *dev, diff --git
> a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> index 11641f4645e6..2a37ab7a7150 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> @@ -145,6 +145,7 @@ static void
> event_interrupt_poison_consumption_v9(struct kfd_node *dev, {
> enum amdgpu_ras_block block = 0;
> int old_poison, ret = -EINVAL;
> + uint32_t reset = 0;
> struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
>
> if (!p)
> @@ -164,6 +165,15 @@ static void
> event_interrupt_poison_consumption_v9(struct kfd_node *dev,
> case SOC15_IH_CLIENTID_UTCL2:
> ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
> block = AMDGPU_RAS_BLOCK__GFX;
> + if (ret)
> + reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
> + break;
> + case SOC15_IH_CLIENTID_VMC:
> + case SOC15_IH_CLIENTID_VMC1:
> + ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
> + block = AMDGPU_RAS_BLOCK__MMHUB;
> + if (ret)
> + reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
> break;
> case SOC15_IH_CLIENTID_SDMA0:
> case SOC15_IH_CLIENTID_SDMA1:
> @@ -171,6 +181,7 @@ static void
> event_interrupt_poison_consumption_v9(struct kfd_node *dev,
> case SOC15_IH_CLIENTID_SDMA3:
> case SOC15_IH_CLIENTID_SDMA4:
> block = AMDGPU_RAS_BLOCK__SDMA;
> + reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
> break;
> default:
> break;
> @@ -181,17 +192,16 @@ static void
> event_interrupt_poison_consumption_v9(struct kfd_node *dev,
> /* resetting queue passes, do page retirement without gpu reset
> * resetting queue fails, fallback to gpu reset solution
> */
> - if (!ret) {
> + if (!ret)
> dev_warn(dev->adev->dev,
> "RAS poison consumption, unmap queue flow succeeded: client
> id %d\n",
> client_id);
> - amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block,
> false);
> - } else {
> + else
> dev_warn(dev->adev->dev,
> "RAS poison consumption, fall back to gpu reset flow: client
> id %d\n",
> client_id);
> - amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block,
> true);
> - }
> +
> + amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block,
> + reset);
> }
>
> static bool context_id_expected(struct kfd_dev *dev)
> --
> 2.34.1
>
More information about the amd-gfx
mailing list