[PATCH] drm/amdgpu: Prefer RAS recovery for scheduler hang
Lazar, Lijo
lijo.lazar at amd.com
Wed Nov 20 05:43:34 UTC 2024
<Ping>
On 11/18/2024 11:14 AM, Lijo Lazar wrote:
> Before scheduling a recovery due to a scheduler/job hang, check whether a
> RAS error has been detected. If so, choose RAS recovery to handle the
> situation, since a scheduler/job hang could be a side effect of a RAS
> error. In such cases the RAS error recovery process must be followed,
> and in certain cases it can also avoid a full device reset.
>
> An error state is maintained in the RAS context to track the affected
> block. Fatal error state uses an otherwise unused block id. The block id
> is set when an error is detected. If the interrupt handler has already
> detected a poison error, there is no need to look for a fatal error, so
> fatal error checking is skipped in such cases.
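
[Not part of the patch: a rough sketch of the intended usage of the new
error-state helpers, based on the description above; the wrapper function
name is hypothetical, only the amdgpu_ras_* helpers come from the patch
below.]

	/* Sketch (assumed usage) of the per-block RAS error state. */
	static void example_ras_err_state_usage(struct amdgpu_device *adev)
	{
		/* Poison consumed by a block: record that block's bit. */
		amdgpu_ras_set_err_poison(adev, AMDGPU_RAS_BLOCK__SDMA);

		/* Fatal error: reuses the unused AMDGPU_RAS_BLOCK__LAST bit. */
		amdgpu_ras_set_fed(adev, true);

		/* Any bit set: RAS recovery should own the reset instead of
		 * the scheduler-timeout path.
		 */
		if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY))
			return;

		/* Cleared once the reset/reinit sequence completes. */
		amdgpu_ras_clear_err_state(adev);
	}
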
>
> Signed-off-by: Lijo Lazar <lijo.lazar at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/aldebaran.c | 2 +
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 15 ++++-
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 55 ++++++++++++++++++-
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 11 +++-
> .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 2 +
> 5 files changed, 78 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/aldebaran.c b/drivers/gpu/drm/amd/amdgpu/aldebaran.c
> index 3a588fecb0c5..6a2fd9e4f470 100644
> --- a/drivers/gpu/drm/amd/amdgpu/aldebaran.c
> +++ b/drivers/gpu/drm/amd/amdgpu/aldebaran.c
> @@ -332,6 +332,8 @@ aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
> list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
> dev_info(tmp_adev->dev,
> "GPU reset succeeded, trying to resume\n");
> + /*TBD: Ideally should clear only GFX, SDMA blocks*/
> + amdgpu_ras_clear_err_state(tmp_adev);
> r = aldebaran_mode2_restore_ip(tmp_adev);
> if (r)
> goto end;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index b3ca911e55d6..b4464f42d72a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -5165,7 +5165,7 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
> if (r)
> return r;
>
> - amdgpu_ras_set_fed(adev, false);
> + amdgpu_ras_clear_err_state(adev);
> amdgpu_irq_gpu_reset_resume_helper(adev);
>
> /* some sw clean up VF needs to do before recover */
> @@ -5460,7 +5460,7 @@ int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
> amdgpu_set_init_level(tmp_adev, AMDGPU_INIT_LEVEL_DEFAULT);
> if (full_reset) {
> /* post card */
> - amdgpu_ras_set_fed(tmp_adev, false);
> + amdgpu_ras_clear_err_state(tmp_adev);
> r = amdgpu_device_asic_init(tmp_adev);
> if (r) {
> dev_warn(tmp_adev->dev, "asic atom init failed!");
> @@ -5786,6 +5786,17 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
> bool audio_suspended = false;
> int retry_limit = AMDGPU_MAX_RETRY_LIMIT;
>
> + /*
> + * If it reaches here because of hang/timeout and a RAS error is
> + * detected at the same time, let RAS recovery take care of it.
> + */
> + if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY) &&
> + reset_context->src != AMDGPU_RESET_SRC_RAS) {
> + dev_dbg(adev->dev,
> + "Gpu recovery from source: %d yielding to RAS error recovery handling",
> + reset_context->src);
> + return 0;
> + }
> /*
> * Special case: RAS triggered and full reset isn't supported
> */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 1bc95b0cdbb8..e31b12144577 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -2156,6 +2156,16 @@ void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev)
> /* Fatal error events are handled on host side */
> if (amdgpu_sriov_vf(adev))
> return;
> + /**
> + * If the current interrupt is caused by a non-fatal RAS error, skip
> + * check for fatal error. For fatal errors, FED status of all devices
> + * in XGMI hive gets set when the first device gets fatal error
> + * interrupt. The error gets propagated to other devices as well, so
> + * make sure to ack the interrupt regardless of FED status.
> + */
> + if (!amdgpu_ras_get_fed_status(adev) &&
> + amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY))
> + return;
>
> if (adev->nbio.ras &&
> adev->nbio.ras->handle_ras_controller_intr_no_bifring)
> @@ -2185,6 +2195,7 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
> if (ret)
> return;
>
> + amdgpu_ras_set_err_poison(adev, block_obj->ras_comm.block);
> /* both query_poison_status and handle_poison_consumption are optional,
> * but at least one of them should be implemented if we need poison
> * consumption handler
> @@ -4083,16 +4094,56 @@ bool amdgpu_ras_get_fed_status(struct amdgpu_device *adev)
> if (!ras)
> return false;
>
> - return atomic_read(&ras->fed);
> + return test_bit(AMDGPU_RAS_BLOCK__LAST, &ras->ras_err_state);
> }
>
> void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status)
> {
> struct amdgpu_ras *ras;
>
> + ras = amdgpu_ras_get_context(adev);
> + if (ras) {
> + if (status)
> + set_bit(AMDGPU_RAS_BLOCK__LAST, &ras->ras_err_state);
> + else
> + clear_bit(AMDGPU_RAS_BLOCK__LAST, &ras->ras_err_state);
> + }
> +}
> +
> +void amdgpu_ras_clear_err_state(struct amdgpu_device *adev)
> +{
> + struct amdgpu_ras *ras;
> +
> ras = amdgpu_ras_get_context(adev);
> if (ras)
> - atomic_set(&ras->fed, !!status);
> + ras->ras_err_state = 0;
> +}
> +
> +void amdgpu_ras_set_err_poison(struct amdgpu_device *adev,
> + enum amdgpu_ras_block block)
> +{
> + struct amdgpu_ras *ras;
> +
> + ras = amdgpu_ras_get_context(adev);
> + if (ras)
> + set_bit(block, &ras->ras_err_state);
> +}
> +
> +bool amdgpu_ras_is_err_state(struct amdgpu_device *adev, int block)
> +{
> + struct amdgpu_ras *ras;
> +
> + ras = amdgpu_ras_get_context(adev);
> + if (ras) {
> + if (block == AMDGPU_RAS_BLOCK__ANY)
> + return (ras->ras_err_state != 0);
> + else
> + return test_bit(block, &ras->ras_err_state) ||
> + test_bit(AMDGPU_RAS_BLOCK__LAST,
> + &ras->ras_err_state);
> + }
> +
> + return false;
> }
>
> static struct ras_event_manager *__get_ras_event_mgr(struct amdgpu_device *adev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 6db772ecfee4..b13debcf48ee 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -99,7 +99,8 @@ enum amdgpu_ras_block {
> AMDGPU_RAS_BLOCK__IH,
> AMDGPU_RAS_BLOCK__MPIO,
>
> - AMDGPU_RAS_BLOCK__LAST
> + AMDGPU_RAS_BLOCK__LAST,
> + AMDGPU_RAS_BLOCK__ANY = -1
> };
>
> enum amdgpu_ras_mca_block {
> @@ -558,8 +559,8 @@ struct amdgpu_ras {
> struct ras_ecc_log_info umc_ecc_log;
> struct delayed_work page_retirement_dwork;
>
> - /* Fatal error detected flag */
> - atomic_t fed;
> + /* ras errors detected */
> + unsigned long ras_err_state;
>
> /* RAS event manager */
> struct ras_event_manager __event_mgr;
> @@ -952,6 +953,10 @@ ssize_t amdgpu_ras_aca_sysfs_read(struct device *dev, struct device_attribute *a
>
> void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status);
> bool amdgpu_ras_get_fed_status(struct amdgpu_device *adev);
> +void amdgpu_ras_set_err_poison(struct amdgpu_device *adev,
> + enum amdgpu_ras_block block);
> +void amdgpu_ras_clear_err_state(struct amdgpu_device *adev);
> +bool amdgpu_ras_is_err_state(struct amdgpu_device *adev, int block);
>
> u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum ras_event_type type);
> int amdgpu_ras_mark_ras_event_caller(struct amdgpu_device *adev, enum ras_event_type type,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> index d46a13156ee9..0cb5c582ce7d 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> @@ -184,6 +184,7 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
> } else {
> reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
> }
> + amdgpu_ras_set_err_poison(dev->adev, AMDGPU_RAS_BLOCK__GFX);
> break;
> case SOC15_IH_CLIENTID_VMC:
> case SOC15_IH_CLIENTID_VMC1:
> @@ -213,6 +214,7 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
> } else {
> reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
> }
> + amdgpu_ras_set_err_poison(dev->adev, AMDGPU_RAS_BLOCK__SDMA);
> break;
> default:
> dev_warn(dev->adev->dev,