[PATCH] drm/amdgpu: Add fatal error handling in nbio v4_3

Thu Mar 23 03:24:42 UTC 2023

[AMD Official Use Only - General]

Reviewed-by: Tao Zhou <tao.zhou1 at amd.com>

> -----Original Message-----
> From: Zhang, Hawking <Hawking.Zhang at amd.com>
> Sent: Thursday, March 23, 2023 10:24 AM
> To: amd-gfx at lists.freedesktop.org; Zhou1, Tao <Tao.Zhou1 at amd.com>; Yang,
> Stanley <Stanley.Yang at amd.com>; Li, Candice <Candice.Li at amd.com>; Chai,
> Thomas <YiPeng.Chai at amd.com>
> Cc: Zhang, Hawking <Hawking.Zhang at amd.com>
> Subject: [PATCH] drm/amdgpu: Add fatal error handling in nbio v4_3
> 
> The GPU will stop working once a fatal error is detected.
> It will inform the driver to do a reset to recover from the fatal error.
> 
> Signed-off-by: Hawking Zhang <Hawking.Zhang at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 11 ++++
> drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c  | 79 +++++++++++++++++++++++++
> drivers/gpu/drm/amd/amdgpu/nbio_v4_3.h  |  1 +
>  drivers/gpu/drm/amd/amdgpu/soc21.c      | 15 ++++-
>  4 files changed, 105 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index c6dc3cd2a9de..5b1779021881 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -34,6 +34,7 @@
>  #include "amdgpu_atomfirmware.h"
>  #include "amdgpu_xgmi.h"
>  #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
> +#include "nbio_v4_3.h"
>  #include "atom.h"
>  #include "amdgpu_reset.h"
> 
> @@ -2562,6 +2563,16 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
>  		if (!adev->gmc.xgmi.connected_to_cpu)
>  			adev->nbio.ras = &nbio_v7_4_ras;
>  		break;
> +	case IP_VERSION(4, 3, 0):
> +		if (adev->ras_hw_enabled | AMDGPU_RAS_BLOCK__DF)
> +			/* unlike other generation of nbio ras,
> +			 * nbio v4_3 only supports fatal error interrupt
> +			 * to inform software that DF is frozen due to
> +			 * system fatal error event. driver should not
> +			 * enable nbio ras in such case. Instead,
> +			 * check DF RAS */
> +			adev->nbio.ras = &nbio_v4_3_ras;
> +		break;
>  	default:
>  		/* nbio ras is not available */
>  		break;
> diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c
> b/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c
> index 09fdcd20cb91..d5ed9e0e1a5f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c
> +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c
> @@ -26,6 +26,7 @@
> 
>  #include "nbio/nbio_4_3_0_offset.h"
>  #include "nbio/nbio_4_3_0_sh_mask.h"
> +#include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
>  #include <uapi/linux/kfd_ioctl.h>
> 
>  static void nbio_v4_3_remap_hdp_registers(struct amdgpu_device *adev) @@ -
> 538,3 +539,81 @@ const struct amdgpu_nbio_funcs nbio_v4_3_sriov_funcs = {
>  	.remap_hdp_registers = nbio_v4_3_remap_hdp_registers,
>  	.get_rom_offset = nbio_v4_3_get_rom_offset,  };
> +
> +static int nbio_v4_3_set_ras_err_event_athub_irq_state(struct amdgpu_device
> *adev,
> +						       struct amdgpu_irq_src *src,
> +						       unsigned type,
> +						       enum
> amdgpu_interrupt_state state) {
> +	/* The ras_controller_irq enablement should be done in psp bl when it
> +	 * tries to enable ras feature. Driver only need to set the correct
> interrupt
> +	 * vector for bare-metal and sriov use case respectively
> +	 */
> +	uint32_t bif_doorbell_int_cntl;
> +
> +	bif_doorbell_int_cntl = RREG32_SOC15(NBIO, 0,
> regBIF_BX0_BIF_DOORBELL_INT_CNTL);
> +	bif_doorbell_int_cntl = REG_SET_FIELD(bif_doorbell_int_cntl,
> +					      BIF_BX0_BIF_DOORBELL_INT_CNTL,
> +
> RAS_ATHUB_ERR_EVENT_INTERRUPT_DISABLE,
> +					      (state ==
> AMDGPU_IRQ_STATE_ENABLE) ? 0 : 1);
> +	WREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL,
> +bif_doorbell_int_cntl);
> +
> +	return 0;
> +}
> +
> +static int nbio_v4_3_process_err_event_athub_irq(struct amdgpu_device
> *adev,
> +						 struct amdgpu_irq_src
> *source,
> +						 struct amdgpu_iv_entry *entry)
> +{
> +	/* By design, the ih cookie for err_event_athub_irq should be written
> +	 * to bif ring. since bif ring is not enabled, just leave process callback
> +	 * as a dummy one.
> +	 */
> +	return 0;
> +}
> +
> +static const struct amdgpu_irq_src_funcs
> nbio_v4_3_ras_err_event_athub_irq_funcs = {
> +	.set = nbio_v4_3_set_ras_err_event_athub_irq_state,
> +	.process = nbio_v4_3_process_err_event_athub_irq,
> +};
> +
> +static void nbio_v4_3_handle_ras_err_event_athub_intr_no_bifring(struct
> +amdgpu_device *adev) {
> +	uint32_t bif_doorbell_int_cntl;
> +
> +	bif_doorbell_int_cntl = RREG32_SOC15(NBIO, 0,
> regBIF_BX0_BIF_DOORBELL_INT_CNTL);
> +	if (REG_GET_FIELD(bif_doorbell_int_cntl,
> +			  BIF_DOORBELL_INT_CNTL,
> +			  RAS_ATHUB_ERR_EVENT_INTERRUPT_STATUS)) {
> +		/* driver has to clear the interrupt status when bif ring is
> disabled */
> +		bif_doorbell_int_cntl = REG_SET_FIELD(bif_doorbell_int_cntl,
> +						BIF_DOORBELL_INT_CNTL,
> +
> 	RAS_ATHUB_ERR_EVENT_INTERRUPT_CLEAR, 1);
> +		WREG32_SOC15(NBIO, 0,
> regBIF_BX0_BIF_DOORBELL_INT_CNTL, bif_doorbell_int_cntl);
> +		amdgpu_ras_global_ras_isr(adev);
> +	}
> +}
> +
> +static int nbio_v4_3_init_ras_err_event_athub_interrupt(struct
> +amdgpu_device *adev) {
> +
> +	int r;
> +
> +	/* init the irq funcs */
> +	adev->nbio.ras_err_event_athub_irq.funcs =
> +		&nbio_v4_3_ras_err_event_athub_irq_funcs;
> +	adev->nbio.ras_err_event_athub_irq.num_types = 1;
> +
> +	/* register ras err event athub interrupt
> +	 * nbio v4_3 uses the same irq source as nbio v7_4 */
> +	r = amdgpu_irq_add_id(adev, SOC21_IH_CLIENTID_BIF,
> +			      NBIF_7_4__SRCID__ERREVENT_ATHUB_INTERRUPT,
> +			      &adev->nbio.ras_err_event_athub_irq);
> +
> +	return r;
> +}
> +
> +struct amdgpu_nbio_ras nbio_v4_3_ras = {
> +	.handle_ras_err_event_athub_intr_no_bifring =
> nbio_v4_3_handle_ras_err_event_athub_intr_no_bifring,
> +	.init_ras_err_event_athub_interrupt =
> +nbio_v4_3_init_ras_err_event_athub_interrupt,
> +};
> diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.h
> b/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.h
> index 711999ceedf4..399037cdf4fb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.h
> +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.h
> @@ -29,5 +29,6 @@
>  extern const struct nbio_hdp_flush_reg nbio_v4_3_hdp_flush_reg;  extern const
> struct amdgpu_nbio_funcs nbio_v4_3_funcs;  extern const struct
> amdgpu_nbio_funcs nbio_v4_3_sriov_funcs;
> +extern struct amdgpu_nbio_ras nbio_v4_3_ras;
> 
>  #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/soc21.c
> b/drivers/gpu/drm/amd/amdgpu/soc21.c
> index 67580761b44d..514bfc705d5a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/soc21.c
> +++ b/drivers/gpu/drm/amd/amdgpu/soc21.c
> @@ -754,6 +754,14 @@ static int soc21_common_late_init(void *handle)
> 
> sriov_vcn_4_0_0_video_codecs_decode_array_vcn0,
> 
> ARRAY_SIZE(sriov_vcn_4_0_0_video_codecs_decode_array_vcn0));
>  		}
> +	} else {
> +		if (adev->nbio.ras &&
> +		    adev->nbio.ras_err_event_athub_irq.funcs)
> +			/* don't need to fail gpu late init
> +			 * if enabling athub_err_event interrupt failed
> +			 * nbio v4_3 only supports fatal error handling
> +			 * just enable the interrupt directly */
> +			amdgpu_irq_get(adev, &adev-
> >nbio.ras_err_event_athub_irq, 0);
>  	}
> 
>  	return 0;
> @@ -801,8 +809,13 @@ static int soc21_common_hw_fini(void *handle)
>  	/* disable the doorbell aperture */
>  	soc21_enable_doorbell_aperture(adev, false);
> 
> -	if (amdgpu_sriov_vf(adev))
> +	if (amdgpu_sriov_vf(adev)) {
>  		xgpu_nv_mailbox_put_irq(adev);
> +	} else {
> +		if (adev->nbio.ras &&
> +		    adev->nbio.ras_err_event_athub_irq.funcs)
> +			amdgpu_irq_put(adev, &adev-
> >nbio.ras_err_event_athub_irq, 0);
> +	}
> 
>  	return 0;
>  }
> --
> 2.17.1