[PATCH v3 2/3] drm/amdgpu: Avoid HW GPU reset for RAS.

Kuehling, Felix Felix.Kuehling at amd.com
Fri Aug 30 20:29:41 UTC 2019


On 2019-08-30 12:39 p.m., Andrey Grodzovsky wrote:
> Problem:
> Under certain conditions, when some IP blocks take a RAS error,
> we can get into a situation where a GPU reset is not possible
> due to issues in RAS in SMU/PSP.
>
> Temporary fix until proper solution in PSP/SMU is ready:
> When an uncorrectable error happens, the DF will unconditionally
> broadcast error event packets to all its clients/slaves upon
> receiving the fatal error event and freeze all its outbound queues;
> the err_event_athub interrupt will then be triggered.
> In such a case we use this interrupt
> to issue a GPU reset. The GPU reset code is modified for this case to avoid a HW
> reset: it only stops the schedulers, detaches the fences of all in-progress and
> not yet scheduled jobs, sets an error code on them and signals them.
> Also reject any new incoming job submissions from user space.
> All this is done to notify the applications of the problem.
>
> v2:
> Extract amdgpu_amdkfd_pre/post_reset from amdgpu_device_lock/unlock_adev
> Move amdgpu_job_stop_all_jobs_on_sched to amdgpu_job.c
> Remove print param from amdgpu_ras_query_error_count
>
> v3:
> Update based on previous bug fixing patch to properly call amdgpu_amdkfd_pre_reset
> for other XGMI hive members.
>
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com>

The KFD part looks good to me. Acked-by: Felix Kuehling 
<Felix.Kuehling at amd.com>


> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c     |  4 ++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 38 ++++++++++++++++++++++--------
>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  5 ++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    | 38 ++++++++++++++++++++++++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.h    |  3 +++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c    |  6 +++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c    | 22 +++++++++++++++--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h    | 10 ++++++++
>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c      | 10 ++++----
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c      | 24 ++++++++++---------
>   drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c     |  5 ++++
>   drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c     | 32 +++++++++++++------------
>   12 files changed, 155 insertions(+), 42 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index d860170..494c384 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -38,6 +38,7 @@
>   #include "amdgpu_gmc.h"
>   #include "amdgpu_gem.h"
>   #include "amdgpu_display.h"
> +#include "amdgpu_ras.h"
>   
>   static int amdgpu_cs_user_fence_chunk(struct amdgpu_cs_parser *p,
>   				      struct drm_amdgpu_cs_chunk_fence *data,
> @@ -1438,6 +1439,9 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
>   	bool reserved_buffers = false;
>   	int i, r;
>   
> +	if (amdgpu_ras_intr_triggered())
> +		return -EHWPOISON;
> +
>   	if (!adev->accel_working)
>   		return -EBUSY;
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 19f6624..c9825ae 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3727,25 +3727,18 @@ static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock)
>   		adev->mp1_state = PP_MP1_STATE_NONE;
>   		break;
>   	}
> -	/* Block kfd: SRIOV would do it separately */
> -	if (!amdgpu_sriov_vf(adev))
> -                amdgpu_amdkfd_pre_reset(adev);
>   
>   	return true;
>   }
>   
>   static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
>   {
> -	/*unlock kfd: SRIOV would do it separately */
> -	if (!amdgpu_sriov_vf(adev))
> -                amdgpu_amdkfd_post_reset(adev);
>   	amdgpu_vf_error_trans_all(adev);
>   	adev->mp1_state = PP_MP1_STATE_NONE;
>   	adev->in_gpu_reset = 0;
>   	mutex_unlock(&adev->lock_reset);
>   }
>   
> -
>   /**
>    * amdgpu_device_gpu_recover - reset the asic and recover scheduler
>    *
> @@ -3765,11 +3758,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   	struct amdgpu_hive_info *hive = NULL;
>   	struct amdgpu_device *tmp_adev = NULL;
>   	int i, r = 0;
> +	bool in_ras_intr = amdgpu_ras_intr_triggered();
>   
>   	need_full_reset = job_signaled = false;
>   	INIT_LIST_HEAD(&device_list);
>   
> -	dev_info(adev->dev, "GPU reset begin!\n");
> +	dev_info(adev->dev, "GPU %s begin!\n", in_ras_intr ? "jobs stop":"reset");
>   
>   	cancel_delayed_work_sync(&adev->delayed_init_work);
>   
> @@ -3796,9 +3790,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   		return 0;
>   	}
>   
> +	/* Block kfd: SRIOV would do it separately */
> +	if (!amdgpu_sriov_vf(adev))
> +                amdgpu_amdkfd_pre_reset(adev);
> +
>   	/* Build list of devices to reset */
>   	if  (adev->gmc.xgmi.num_physical_nodes > 1) {
>   		if (!hive) {
> +			/*unlock kfd: SRIOV would do it separately */
> +			if (!amdgpu_sriov_vf(adev))
> +		                amdgpu_amdkfd_post_reset(adev);
>   			amdgpu_device_unlock_adev(adev);
>   			return -ENODEV;
>   		}
> @@ -3816,8 +3817,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   
>   	/* block all schedulers and reset given job's ring */
>   	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
> -		if (tmp_adev != adev)
> +		if (tmp_adev != adev) {
>   			amdgpu_device_lock_adev(tmp_adev, false);
> +			if (!amdgpu_sriov_vf(tmp_adev))
> +			                amdgpu_amdkfd_pre_reset(tmp_adev);
> +		}
> +
>   		/*
>   		 * Mark these ASICs to be reseted as untracked first
>   		 * And add them back after reset completed
> @@ -3825,7 +3830,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   		amdgpu_unregister_gpu_instance(tmp_adev);
>   
>   		/* disable ras on ALL IPs */
> -		if (amdgpu_device_ip_need_full_reset(tmp_adev))
> +		if (!in_ras_intr && amdgpu_device_ip_need_full_reset(tmp_adev))
>   			amdgpu_ras_suspend(tmp_adev);
>   
>   		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> @@ -3835,10 +3840,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   				continue;
>   
>   			drm_sched_stop(&ring->sched, job ? &job->base : NULL);
> +
> +			if (in_ras_intr)
> +				amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
>   		}
>   	}
>   
>   
> +	if (in_ras_intr)
> +		goto skip_sched_resume;
> +
>   	/*
>   	 * Must check guilty signal here since after this point all old
>   	 * HW fences are force signaled.
> @@ -3897,6 +3908,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   
>   	/* Post ASIC reset for all devs .*/
>   	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
> +
>   		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>   			struct amdgpu_ring *ring = tmp_adev->rings[i];
>   
> @@ -3923,7 +3935,13 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   		} else {
>   			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
>   		}
> +	}
>   
> +skip_sched_resume:
> +	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
> +		/*unlock kfd: SRIOV would do it separately */
> +		if (!in_ras_intr && !amdgpu_sriov_vf(tmp_adev))
> +	                amdgpu_amdkfd_post_reset(tmp_adev);
>   		amdgpu_device_unlock_adev(tmp_adev);
>   	}
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 7679fe8..c73d26a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -40,6 +40,8 @@
>   
>   #include "amdgpu_amdkfd.h"
>   
> +#include "amdgpu_ras.h"
> +
>   /*
>    * KMS wrapper.
>    * - 3.0.0 - initial driver
> @@ -1180,6 +1182,9 @@ amdgpu_pci_shutdown(struct pci_dev *pdev)
>   	struct drm_device *dev = pci_get_drvdata(pdev);
>   	struct amdgpu_device *adev = dev->dev_private;
>   
> +	if (amdgpu_ras_intr_triggered())
> +		return;
> +
>   	/* if we are running in a VM, make sure the device
>   	 * torn down properly on reboot/shutdown.
>   	 * unfortunately we can't detect certain
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index 4d67b77..b12981e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -250,6 +250,44 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
>   	return fence;
>   }
>   
> +#define to_drm_sched_job(sched_job)		\
> +		container_of((sched_job), struct drm_sched_job, queue_node)
> +
> +void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler *sched)
> +{
> +	struct drm_sched_job *s_job;
> +	struct drm_sched_entity *s_entity = NULL;
> +	int i;
> +
> +	/* Signal all jobs not yet scheduled */
> +	for (i = DRM_SCHED_PRIORITY_MAX - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
> +		struct drm_sched_rq *rq = &sched->sched_rq[i];
> +
> +		if (!rq)
> +			continue;
> +
> +		spin_lock(&rq->lock);
> +		list_for_each_entry(s_entity, &rq->entities, list) {
> +			while ((s_job = to_drm_sched_job(spsc_queue_pop(&s_entity->job_queue)))) {
> +				struct drm_sched_fence *s_fence = s_job->s_fence;
> +
> +				dma_fence_signal(&s_fence->scheduled);
> +				dma_fence_set_error(&s_fence->finished, -EHWPOISON);
> +				dma_fence_signal(&s_fence->finished);
> +			}
> +		}
> +		spin_unlock(&rq->lock);
> +	}
> +
> +	/* Signal all jobs already scheduled to HW */
> +	list_for_each_entry(s_job, &sched->ring_mirror_list, node) {
> +		struct drm_sched_fence *s_fence = s_job->s_fence;
> +
> +		dma_fence_set_error(&s_fence->finished, -EHWPOISON);
> +		dma_fence_signal(&s_fence->finished);
> +	}
> +}
> +
>   const struct drm_sched_backend_ops amdgpu_sched_ops = {
>   	.dependency = amdgpu_job_dependency,
>   	.run_job = amdgpu_job_run,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> index 51e6250..dc7ee93 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> @@ -76,4 +76,7 @@ int amdgpu_job_submit(struct amdgpu_job *job, struct drm_sched_entity *entity,
>   		      void *owner, struct dma_fence **f);
>   int amdgpu_job_submit_direct(struct amdgpu_job *job, struct amdgpu_ring *ring,
>   			     struct dma_fence **fence);
> +
> +void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler *sched);
> +
>   #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> index 35a0866..535f690 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> @@ -1046,6 +1046,12 @@ int amdgpu_driver_open_kms(struct drm_device *dev, struct drm_file *file_priv)
>   	/* Ensure IB tests are run on ring */
>   	flush_delayed_work(&adev->delayed_init_work);
>   
> +
> +	if (amdgpu_ras_intr_triggered()) {
> +		DRM_ERROR("RAS Intr triggered, device disabled!!");
> +		return -EHWPOISON;
> +	}
> +
>   	file_priv->driver_priv = NULL;
>   
>   	r = pm_runtime_get_sync(dev->dev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index aa51c00..1cc34de 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -24,6 +24,8 @@
>   #include <linux/debugfs.h>
>   #include <linux/list.h>
>   #include <linux/module.h>
> +#include <linux/reboot.h>
> +#include <linux/syscalls.h>
>   #include "amdgpu.h"
>   #include "amdgpu_ras.h"
>   #include "amdgpu_atomfirmware.h"
> @@ -64,6 +66,9 @@ const char *ras_block_string[] = {
>   /* inject address is 52 bits */
>   #define	RAS_UMC_INJECT_ADDR_LIMIT	(0x1ULL << 52)
>   
> +
> +atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0);
> +
>   static int amdgpu_ras_reserve_vram(struct amdgpu_device *adev,
>   		uint64_t offset, uint64_t size,
>   		struct amdgpu_bo **bo_ptr);
> @@ -188,6 +193,10 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
>   
>   	return 0;
>   }
> +
> +static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
> +		struct ras_common_if *head);
> +
>   /**
>    * DOC: AMDGPU RAS debugfs control interface
>    *
> @@ -627,12 +636,14 @@ int amdgpu_ras_error_query(struct amdgpu_device *adev,
>   	info->ue_count = obj->err_data.ue_count;
>   	info->ce_count = obj->err_data.ce_count;
>   
> -	if (err_data.ce_count)
> +	if (err_data.ce_count) {
>   		dev_info(adev->dev, "%ld correctable errors detected in %s block\n",
>   			 obj->err_data.ce_count, ras_block_str(info->head.block));
> -	if (err_data.ue_count)
> +	}
> +	if (err_data.ue_count) {
>   		dev_info(adev->dev, "%ld uncorrectable errors detected in %s block\n",
>   			 obj->err_data.ue_count, ras_block_str(info->head.block));
> +	}
>   
>   	return 0;
>   }
> @@ -1718,3 +1729,10 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
>   
>   	return 0;
>   }
> +
> +void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
> +{
> +	if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
> +		DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT detected! Stopping all GPU jobs.\n");
> +	}
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index fc4fb0f..3ec2a87 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -600,4 +600,14 @@ int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev,
>   
>   int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,
>   		struct ras_dispatch_if *info);
> +
> +extern atomic_t amdgpu_ras_in_intr;
> +
> +static inline bool amdgpu_ras_intr_triggered(void)
> +{
> +	return !!atomic_read(&amdgpu_ras_in_intr);
> +}
> +
> +void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev);
> +
>   #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 93e3e89..817997b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -5676,10 +5676,12 @@ static int gfx_v9_0_process_ras_data_cb(struct amdgpu_device *adev,
>   		struct amdgpu_iv_entry *entry)
>   {
>   	/* TODO ue will trigger an interrupt. */
> -	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
> -	if (adev->gfx.funcs->query_ras_error_count)
> -		adev->gfx.funcs->query_ras_error_count(adev, err_data);
> -	amdgpu_ras_reset_gpu(adev, 0);
> +	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {
> +		kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
> +		if (adev->gfx.funcs->query_ras_error_count)
> +			adev->gfx.funcs->query_ras_error_count(adev, err_data);
> +		amdgpu_ras_reset_gpu(adev, 0);
> +	}
>   	return AMDGPU_RAS_SUCCESS;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 5eb17c7..2a6ac60 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -243,18 +243,20 @@ static int gmc_v9_0_process_ras_data_cb(struct amdgpu_device *adev,
>   		struct ras_err_data *err_data,
>   		struct amdgpu_iv_entry *entry)
>   {
> -	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
> -	if (adev->umc.funcs->query_ras_error_count)
> -		adev->umc.funcs->query_ras_error_count(adev, err_data);
> -	/* umc query_ras_error_address is also responsible for clearing
> -	 * error status
> -	 */
> -	if (adev->umc.funcs->query_ras_error_address)
> -		adev->umc.funcs->query_ras_error_address(adev, err_data);
> +	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {
> +		kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
> +		if (adev->umc.funcs->query_ras_error_count)
> +			adev->umc.funcs->query_ras_error_count(adev, err_data);
> +		/* umc query_ras_error_address is also responsible for clearing
> +		 * error status
> +		 */
> +		if (adev->umc.funcs->query_ras_error_address)
> +			adev->umc.funcs->query_ras_error_address(adev, err_data);
>   
> -	/* only uncorrectable error needs gpu reset */
> -	if (err_data->ue_count)
> -		amdgpu_ras_reset_gpu(adev, 0);
> +		/* only uncorrectable error needs gpu reset */
> +		if (err_data->ue_count)
> +			amdgpu_ras_reset_gpu(adev, 0);
> +	}
>   
>   	return AMDGPU_RAS_SUCCESS;
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
> index 367f9d6..545990c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
> +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
> @@ -30,6 +30,7 @@
>   #include "nbio/nbio_7_4_0_smn.h"
>   #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
>   #include <uapi/linux/kfd_ioctl.h>
> +#include "amdgpu_ras.h"
>   
>   #define smnNBIF_MGCG_CTRL_LCLK	0x1013a21c
>   
> @@ -329,6 +330,8 @@ static void nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device
>   						BIF_DOORBELL_INT_CNTL,
>   						RAS_CNTLR_INTERRUPT_CLEAR, 1);
>   		WREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL, bif_doorbell_intr_cntl);
> +
> +		amdgpu_ras_global_ras_isr(adev);
>   	}
>   }
>   
> @@ -344,6 +347,8 @@ static void nbio_v7_4_handle_ras_err_event_athub_intr_no_bifring(struct amdgpu_d
>   						BIF_DOORBELL_INT_CNTL,
>   						RAS_ATHUB_ERR_EVENT_INTERRUPT_CLEAR, 1);
>   		WREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL, bif_doorbell_intr_cntl);
> +
> +		amdgpu_ras_global_ras_isr(adev);
>   	}
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> index b3ed533..b05428f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> @@ -1972,24 +1972,26 @@ static int sdma_v4_0_process_ras_data_cb(struct amdgpu_device *adev,
>   	uint32_t err_source;
>   	int instance;
>   
> -	instance = sdma_v4_0_irq_id_to_seq(entry->client_id);
> -	if (instance < 0)
> -		return 0;
> +	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {
> +		instance = sdma_v4_0_irq_id_to_seq(entry->client_id);
> +		if (instance < 0)
> +			return 0;
>   
> -	switch (entry->src_id) {
> -	case SDMA0_4_0__SRCID__SDMA_SRAM_ECC:
> -		err_source = 0;
> -		break;
> -	case SDMA0_4_0__SRCID__SDMA_ECC:
> -		err_source = 1;
> -		break;
> -	default:
> -		return 0;
> -	}
> +		switch (entry->src_id) {
> +		case SDMA0_4_0__SRCID__SDMA_SRAM_ECC:
> +			err_source = 0;
> +			break;
> +		case SDMA0_4_0__SRCID__SDMA_ECC:
> +			err_source = 1;
> +			break;
> +		default:
> +			return 0;
> +		}
>   
> -	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
> +		kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
>   
> -	amdgpu_ras_reset_gpu(adev, 0);
> +		amdgpu_ras_reset_gpu(adev, 0);
> +	}
>   
>   	return AMDGPU_RAS_SUCCESS;
>   }


More information about the amd-gfx mailing list