[PATCH v2 1/2] dmr/amdgpu: Avoid HW GPU reset for RAS.
Zhou1, Tao
Tao.Zhou1 at amd.com
Fri Aug 30 01:51:18 UTC 2019
> -----Original Message-----
> From: Andrey Grodzovsky <andrey.grodzovsky at amd.com>
> Sent: 2019年8月30日 8:54
> To: amd-gfx at lists.freedesktop.org
> Cc: alexdeucher at gmail.com; Zhang, Hawking <Hawking.Zhang at amd.com>;
> ckoenig.leichtzumerken at gmail.com; Zhou1, Tao <Tao.Zhou1 at amd.com>;
> Grodzovsky, Andrey <Andrey.Grodzovsky at amd.com>
> Subject: [PATCH v2 1/2] dmr/amdgpu: Avoid HW GPU reset for RAS.
>
> Problem:
> Under certain conditions, when some IP bocks take a RAS error, we can get
[Tao] typo: "dmr/amdgpu" -> "drm/amdgpu", "IP bocks" -> "IP blocks"
> into a situation where a GPU reset is not possible due to issues in RAS in
> SMU/PSP.
>
> Temporary fix until proper solution in PSP/SMU is ready:
> When an uncorrectable error happens, the DF will unconditionally broadcast
> error event packets to all its clients/slaves upon receiving the fatal error event and
> freeze all its outbound queues; the err_event_athub interrupt will then be triggered.
> In such a case we use this interrupt
> to issue a GPU reset. The GPU reset code is modified for such a case to avoid a HW
> reset: it only stops the schedulers, detaches the fences of all in-progress and not yet
> scheduled jobs, sets an error code on them and signals them.
> Also reject any new incoming job submissions from user space.
> All this is done to notify the applications of the problem.
>
> v2:
> Extract amdgpu_amdkfd_pre/post_reset from
> amdgpu_device_lock/unlock_adev Move
> amdgpu_job_stop_all_jobs_on_sched to amdgpu_job.c Remove print param
> from amdgpu_ras_query_error_count
>
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 4 +++
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 46
> +++++++++++++++++++++++-------
> drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 5 ++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 38
> ++++++++++++++++++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_job.h | 3 ++
> drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 6 ++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 22 ++++++++++++--
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 10 +++++++
> drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 10 ++++---
> drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 24 +++++++++-------
> drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c | 5 ++++
> drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 32 +++++++++++----------
> 12 files changed, 163 insertions(+), 42 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index 9da681e..300adb8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -38,6 +38,7 @@
> #include "amdgpu_gmc.h"
> #include "amdgpu_gem.h"
> #include "amdgpu_display.h"
> +#include "amdgpu_ras.h"
>
> #if defined(HAVE_DRM_FREE_LARGE)
> #define kvfree drm_free_large
> @@ -1461,6 +1462,9 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void
> *data, struct drm_file *filp)
> bool reserved_buffers = false;
> int i, r;
>
> + if (amdgpu_ras_intr_triggered())
> + return -EHWPOISON;
> +
> if (!adev->accel_working)
> return -EBUSY;
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index a5daccc..d3a078b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3727,25 +3727,18 @@ static bool amdgpu_device_lock_adev(struct
> amdgpu_device *adev, bool trylock)
> adev->mp1_state = PP_MP1_STATE_NONE;
> break;
> }
> - /* Block kfd: SRIOV would do it separately */
> - if (!amdgpu_sriov_vf(adev))
> - amdgpu_amdkfd_pre_reset(adev);
>
> return true;
> }
>
> static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) {
> - /*unlock kfd: SRIOV would do it separately */
> - if (!amdgpu_sriov_vf(adev))
> - amdgpu_amdkfd_post_reset(adev);
> amdgpu_vf_error_trans_all(adev);
> adev->mp1_state = PP_MP1_STATE_NONE;
> adev->in_gpu_reset = 0;
> mutex_unlock(&adev->lock_reset);
> }
>
> -
> /**
> * amdgpu_device_gpu_recover - reset the asic and recover scheduler
> *
> @@ -3765,11 +3758,12 @@ int amdgpu_device_gpu_recover(struct
> amdgpu_device *adev,
> struct amdgpu_hive_info *hive = NULL;
> struct amdgpu_device *tmp_adev = NULL;
> int i, r = 0;
> + bool in_ras_intr = amdgpu_ras_intr_triggered();
>
> need_full_reset = job_signaled = false;
> INIT_LIST_HEAD(&device_list);
>
> - dev_info(adev->dev, "GPU reset begin!\n");
> + dev_info(adev->dev, "GPU %s begin!\n", in_ras_intr ? "jobs
> +stop":"reset");
>
> cancel_delayed_work_sync(&adev->delayed_init_work);
>
> @@ -3796,9 +3790,16 @@ int amdgpu_device_gpu_recover(struct
> amdgpu_device *adev,
> return 0;
> }
>
> + /* Block kfd: SRIOV would do it separately */
> + if (!amdgpu_sriov_vf(adev))
> + amdgpu_amdkfd_pre_reset(adev);
> +
> /* Build list of devices to reset */
> if (adev->gmc.xgmi.num_physical_nodes > 1) {
> if (!hive) {
> + /*unlock kfd: SRIOV would do it separately */
> + if (!amdgpu_sriov_vf(adev))
> + amdgpu_amdkfd_post_reset(adev);
> amdgpu_device_unlock_adev(adev);
> return -ENODEV;
> }
> @@ -3824,7 +3825,7 @@ int amdgpu_device_gpu_recover(struct
> amdgpu_device *adev,
> /* block all schedulers and reset given job's ring */
> list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
> /* disable ras on ALL IPs */
> - if (amdgpu_device_ip_need_full_reset(tmp_adev))
> + if (!in_ras_intr &&
> amdgpu_device_ip_need_full_reset(tmp_adev))
> amdgpu_ras_suspend(tmp_adev);
>
> for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { @@ -3834,10
> +3835,26 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
> continue;
>
> drm_sched_stop(&ring->sched, job ? &job->base :
> NULL);
> +
> + if (in_ras_intr)
> + amdgpu_job_stop_all_jobs_on_sched(&ring-
> >sched);
> }
> }
>
>
> + if (in_ras_intr) {
> + list_for_each_entry(tmp_adev, device_list_handle,
> gmc.xgmi.head) {
> + if (tmp_adev == adev)
> + continue;
> +
> + if (amdgpu_device_lock_adev(tmp_adev, false)
> && !amdgpu_sriov_vf(tmp_adev))
> + amdgpu_amdkfd_pre_reset(tmp_adev);
> +
> + }
> +
> + goto skip_sched_resume;
> + }
> +
> /*
> * Must check guilty signal here since after this point all old
> * HW fences are force signaled.
> @@ -3872,7 +3889,9 @@ int amdgpu_device_gpu_recover(struct
> amdgpu_device *adev,
> if (tmp_adev == adev)
> continue;
>
> - amdgpu_device_lock_adev(tmp_adev, false);
> + if (amdgpu_device_lock_adev(tmp_adev, false)
> && !amdgpu_sriov_vf(tmp_adev))
> + amdgpu_amdkfd_pre_reset(tmp_adev);
> +
> r = amdgpu_device_pre_asic_reset(tmp_adev,
> NULL,
> &need_full_reset);
> @@ -3900,6 +3919,7 @@ int amdgpu_device_gpu_recover(struct
> amdgpu_device *adev,
>
> /* Post ASIC reset for all devs .*/
> list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
> +
> for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> struct amdgpu_ring *ring = tmp_adev->rings[i];
>
> @@ -3926,7 +3946,13 @@ int amdgpu_device_gpu_recover(struct
> amdgpu_device *adev,
> } else {
> dev_info(tmp_adev->dev, "GPU reset(%d)
> succeeded!\n", atomic_read(&adev->gpu_reset_counter));
> }
> + }
>
> +skip_sched_resume:
> + list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
> + /*unlock kfd: SRIOV would do it separately */
> + if (!in_ras_intr && !amdgpu_sriov_vf(tmp_adev))
> + amdgpu_amdkfd_post_reset(tmp_adev);
> amdgpu_device_unlock_adev(tmp_adev);
> }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 151d7f2..757fd6d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -40,6 +40,8 @@
>
> #include "amdgpu_amdkfd.h"
>
> +#include "amdgpu_ras.h"
> +
> /*
> * KMS wrapper.
> * - 3.0.0 - initial driver
> @@ -1179,6 +1181,9 @@ amdgpu_pci_shutdown(struct pci_dev *pdev)
> struct drm_device *dev = pci_get_drvdata(pdev);
> struct amdgpu_device *adev = dev->dev_private;
>
> + if (amdgpu_ras_intr_triggered())
> + return;
> +
> /* if we are running in a VM, make sure the device
> * torn down properly on reboot/shutdown.
> * unfortunately we can't detect certain diff --git
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index 4d67b77..b12981e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -250,6 +250,44 @@ static struct dma_fence *amdgpu_job_run(struct
> drm_sched_job *sched_job)
> return fence;
> }
>
> +#define to_drm_sched_job(sched_job) \
> + container_of((sched_job), struct drm_sched_job,
> queue_node)
> +
> +void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler
> *sched)
> +{
> + struct drm_sched_job *s_job;
> + struct drm_sched_entity *s_entity = NULL;
> + int i;
> +
> + /* Signal all jobs not yet scheduled */
> + for (i = DRM_SCHED_PRIORITY_MAX - 1; i >=
> DRM_SCHED_PRIORITY_MIN; i--) {
> + struct drm_sched_rq *rq = &sched->sched_rq[i];
> +
> + if (!rq)
> + continue;
> +
> + spin_lock(&rq->lock);
> + list_for_each_entry(s_entity, &rq->entities, list) {
> + while ((s_job =
> to_drm_sched_job(spsc_queue_pop(&s_entity->job_queue)))) {
> + struct drm_sched_fence *s_fence = s_job-
> >s_fence;
> +
> + dma_fence_signal(&s_fence->scheduled);
> + dma_fence_set_error(&s_fence->finished, -
> EHWPOISON);
> + dma_fence_signal(&s_fence->finished);
> + }
> + }
> + spin_unlock(&rq->lock);
> + }
> +
> + /* Signal all jobs already scheduled to HW */
> + list_for_each_entry(s_job, &sched->ring_mirror_list, node) {
> + struct drm_sched_fence *s_fence = s_job->s_fence;
> +
> + dma_fence_set_error(&s_fence->finished, -EHWPOISON);
> + dma_fence_signal(&s_fence->finished);
> + }
> +}
> +
> const struct drm_sched_backend_ops amdgpu_sched_ops = {
> .dependency = amdgpu_job_dependency,
> .run_job = amdgpu_job_run,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> index 51e6250..dc7ee93 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> @@ -76,4 +76,7 @@ int amdgpu_job_submit(struct amdgpu_job *job, struct
> drm_sched_entity *entity,
> void *owner, struct dma_fence **f); int
> amdgpu_job_submit_direct(struct amdgpu_job *job, struct amdgpu_ring
> *ring,
> struct dma_fence **fence);
> +
> +void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler
> +*sched);
> +
> #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> index da2143d..ced766c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> @@ -1046,6 +1046,12 @@ int amdgpu_driver_open_kms(struct drm_device
> *dev, struct drm_file *file_priv)
> /* Ensure IB tests are run on ring */
> flush_delayed_work(&adev->delayed_init_work);
>
> +
> + if (amdgpu_ras_intr_triggered()) {
> + DRM_ERROR("RAS Intr triggered, device disabled!!");
> + return -EHWPOISON;
> + }
> +
> file_priv->driver_priv = NULL;
>
> r = pm_runtime_get_sync(dev->dev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 2d5897a..7b00ac6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -24,6 +24,8 @@
> #include <linux/debugfs.h>
> #include <linux/list.h>
> #include <linux/module.h>
> +#include <linux/reboot.h>
> +#include <linux/syscalls.h>
> #include "amdgpu.h"
> #include "amdgpu_ras.h"
> #include "amdgpu_atomfirmware.h"
> @@ -64,6 +66,9 @@ const char *ras_block_string[] = {
> /* inject address is 52 bits */
> #define RAS_UMC_INJECT_ADDR_LIMIT (0x1ULL << 52)
>
> +
> +atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0);
> +
> static int amdgpu_ras_reserve_vram(struct amdgpu_device *adev,
> uint64_t offset, uint64_t size,
> struct amdgpu_bo **bo_ptr);
> @@ -188,6 +193,10 @@ static int
> amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
>
> return 0;
> }
> +
> +static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device
> *adev,
> + struct ras_common_if *head);
> +
> /**
> * DOC: AMDGPU RAS debugfs control interface
> *
> @@ -627,12 +636,14 @@ int amdgpu_ras_error_query(struct
> amdgpu_device *adev,
> info->ue_count = obj->err_data.ue_count;
> info->ce_count = obj->err_data.ce_count;
>
> - if (err_data.ce_count)
> + if (err_data.ce_count) {
> dev_info(adev->dev, "%ld correctable errors detected in %s
> block\n",
> obj->err_data.ce_count, ras_block_str(info-
> >head.block));
> - if (err_data.ue_count)
> + }
> + if (err_data.ue_count) {
> dev_info(adev->dev, "%ld uncorrectable errors detected
> in %s block\n",
> obj->err_data.ue_count, ras_block_str(info-
> >head.block));
> + }
>
> return 0;
> }
> @@ -1718,3 +1729,10 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
>
> return 0;
> }
> +
> +void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev) {
> + if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
> + DRM_WARN("RAS event of type
> ERREVENT_ATHUB_INTERRUPT detected! Stopping all GPU jobs.\n");
> + }
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 5a0df73..cf5ffb6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -600,4 +600,14 @@ int amdgpu_ras_interrupt_remove_handler(struct
> amdgpu_device *adev,
>
> int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,
> struct ras_dispatch_if *info);
> +
> +extern atomic_t amdgpu_ras_in_intr;
> +
> +static inline bool amdgpu_ras_intr_triggered(void) {
> + return !!atomic_read(&amdgpu_ras_in_intr);
> +}
> +
> +void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev);
> +
> #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index b2c86a0..e7a83f6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -5669,10 +5669,12 @@ static int gfx_v9_0_process_ras_data_cb(struct
> amdgpu_device *adev,
> struct amdgpu_iv_entry *entry)
> {
> /* TODO ue will trigger an interrupt. */
> - kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
> - if (adev->gfx.funcs->query_ras_error_count)
> - adev->gfx.funcs->query_ras_error_count(adev, err_data);
> - amdgpu_ras_reset_gpu(adev, 0);
> + if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {
> + kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
> + if (adev->gfx.funcs->query_ras_error_count)
> + adev->gfx.funcs->query_ras_error_count(adev,
> err_data);
> + amdgpu_ras_reset_gpu(adev, 0);
> + }
> return AMDGPU_RAS_SUCCESS;
> }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 43b4fbc..87a66c2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -243,18 +243,20 @@ static int gmc_v9_0_process_ras_data_cb(struct
> amdgpu_device *adev,
> struct ras_err_data *err_data,
> struct amdgpu_iv_entry *entry)
> {
> - kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
> - if (adev->umc.funcs->query_ras_error_count)
> - adev->umc.funcs->query_ras_error_count(adev, err_data);
> - /* umc query_ras_error_address is also responsible for clearing
> - * error status
> - */
> - if (adev->umc.funcs->query_ras_error_address)
> - adev->umc.funcs->query_ras_error_address(adev, err_data);
> + if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {
[Tao] A comment explaining this if condition is recommended.
> + kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
> + if (adev->umc.funcs->query_ras_error_count)
> + adev->umc.funcs->query_ras_error_count(adev,
> err_data);
> + /* umc query_ras_error_address is also responsible for
> clearing
> + * error status
> + */
> + if (adev->umc.funcs->query_ras_error_address)
> + adev->umc.funcs->query_ras_error_address(adev,
> err_data);
>
> - /* only uncorrectable error needs gpu reset */
> - if (err_data->ue_count)
> - amdgpu_ras_reset_gpu(adev, 0);
> + /* only uncorrectable error needs gpu reset */
> + if (err_data->ue_count)
> + amdgpu_ras_reset_gpu(adev, 0);
> + }
>
> return AMDGPU_RAS_SUCCESS;
> }
> diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
> b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
> index 367f9d6..545990c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
> +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
> @@ -30,6 +30,7 @@
> #include "nbio/nbio_7_4_0_smn.h"
> #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
> #include <uapi/linux/kfd_ioctl.h>
> +#include "amdgpu_ras.h"
>
> #define smnNBIF_MGCG_CTRL_LCLK 0x1013a21c
>
> @@ -329,6 +330,8 @@ static void
> nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device
> BIF_DOORBELL_INT_CNTL,
>
> RAS_CNTLR_INTERRUPT_CLEAR, 1);
> WREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL,
> bif_doorbell_intr_cntl);
> +
> + amdgpu_ras_global_ras_isr(adev);
> }
> }
>
> @@ -344,6 +347,8 @@ static void
> nbio_v7_4_handle_ras_err_event_athub_intr_no_bifring(struct amdgpu_d
> BIF_DOORBELL_INT_CNTL,
>
> RAS_ATHUB_ERR_EVENT_INTERRUPT_CLEAR, 1);
> WREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL,
> bif_doorbell_intr_cntl);
> +
> + amdgpu_ras_global_ras_isr(adev);
> }
> }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> index 956432f..438e504 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> @@ -1972,24 +1972,26 @@ static int
> sdma_v4_0_process_ras_data_cb(struct amdgpu_device *adev,
> uint32_t err_source;
> int instance;
>
> - instance = sdma_v4_0_irq_id_to_seq(entry->client_id);
> - if (instance < 0)
> - return 0;
> + if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {
> + instance = sdma_v4_0_irq_id_to_seq(entry->client_id);
> + if (instance < 0)
> + return 0;
>
> - switch (entry->src_id) {
> - case SDMA0_4_0__SRCID__SDMA_SRAM_ECC:
> - err_source = 0;
> - break;
> - case SDMA0_4_0__SRCID__SDMA_ECC:
> - err_source = 1;
> - break;
> - default:
> - return 0;
> - }
> + switch (entry->src_id) {
> + case SDMA0_4_0__SRCID__SDMA_SRAM_ECC:
> + err_source = 0;
> + break;
> + case SDMA0_4_0__SRCID__SDMA_ECC:
> + err_source = 1;
> + break;
> + default:
> + return 0;
> + }
>
> - kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
> + kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
>
> - amdgpu_ras_reset_gpu(adev, 0);
> + amdgpu_ras_reset_gpu(adev, 0);
> + }
>
> return AMDGPU_RAS_SUCCESS;
> }
> --
> 2.7.4
More information about the amd-gfx
mailing list