[PATCH 1/2] drm/amdgpu: Avoid HW GPU reset for RAS.
Koenig, Christian
Christian.Koenig at amd.com
Thu Aug 29 14:06:20 UTC 2019
Am 29.08.19 um 16:03 schrieb Grodzovsky, Andrey:
> On 8/29/19 3:30 AM, Christian König wrote:
>> Am 28.08.19 um 22:00 schrieb Andrey Grodzovsky:
>>> Problem:
>>> Under certain conditions, when some IP blocks take a RAS error,
>>> we can get into a situation where a GPU reset is not possible
>>> due to issues in RAS in SMU/PSP.
>>>
>>> Temporary fix until proper solution in PSP/SMU is ready:
>>> When uncorrectable error happens the DF will unconditionally
>>> broadcast error event packets to all its clients/slave upon
>>> receiving fatal error event and freeze all its outbound queues,
>>> err_event_athub interrupt will be triggered.
>>> In such case we use this interrupt
>>> to issue GPU reset. The GPU reset code is modified for such case to
>>> avoid HW
>>> reset, only stops schedulers, detaches all in progress and not yet
>>> scheduled
>>> jobs' fences, sets error code on them and signals them.
>>> Also reject any new incoming job submissions from user space.
>>> All this is done to notify the applications of the problem.
>>>
>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com>
>>> ---
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 4 ++
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 98
>>> ++++++++++++++++++++++--------
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 5 ++
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 6 ++
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 30 +++++++--
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 12 +++-
>>> drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 10 +--
>>> drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 24 ++++----
>>> drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c | 5 ++
>>> drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 32 +++++-----
>>> 10 files changed, 164 insertions(+), 62 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> index 9da681e..300adb8 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> @@ -38,6 +38,7 @@
>>> #include "amdgpu_gmc.h"
>>> #include "amdgpu_gem.h"
>>> #include "amdgpu_display.h"
>>> +#include "amdgpu_ras.h"
>>> #if defined(HAVE_DRM_FREE_LARGE)
>>> #define kvfree drm_free_large
>>> @@ -1461,6 +1462,9 @@ int amdgpu_cs_ioctl(struct drm_device *dev,
>>> void *data, struct drm_file *filp)
>>> bool reserved_buffers = false;
>>> int i, r;
>>> + if (amdgpu_ras_intr_triggered())
>>> + return -EHWPOISON;
>>> +
>>> if (!adev->accel_working)
>>> return -EBUSY;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index 07a4ba0..3ecee10 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -3734,10 +3734,10 @@ static bool amdgpu_device_lock_adev(struct
>>> amdgpu_device *adev, bool trylock)
>>> return true;
>>> }
>>> -static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
>>> +static void amdgpu_device_unlock_adev(struct amdgpu_device *adev,
>>> bool skip_kfd)
>>> {
>>> /*unlock kfd: SRIOV would do it separately */
>>> - if (!amdgpu_sriov_vf(adev))
>>> + if (!amdgpu_sriov_vf(adev) && !skip_kfd)
>>> amdgpu_amdkfd_post_reset(adev);
>> It's most likely better to completely remove the call to
>> amdgpu_amdkfd_post_reset() here.
>
> Felix advised that the way to stop all KFD activity is simply to NOT
> call amdgpu_amdkfd_post_reset so that's why I added this. Do you mean you
> prefer amdgpu_amdkfd_post_reset to be outside of amdgpu_device_unlock_adev ?
Yes, exactly. It doesn't seem to be related to the unlock operation in
the first place, but rather only signals the KFD that the reset is
completed.
Christian.
>
>
>>> amdgpu_vf_error_trans_all(adev);
>>> adev->mp1_state = PP_MP1_STATE_NONE;
>>> @@ -3746,6 +3746,44 @@ static void amdgpu_device_unlock_adev(struct
>>> amdgpu_device *adev)
>>> }
>>> +#define to_drm_sched_job(sched_job) \
>>> + container_of((sched_job), struct drm_sched_job, queue_node)
>>> +
>>> +static void amdgpu_stop_all_jobs_on_sched(struct drm_gpu_scheduler
>>> *sched)
>>> +{
>>> + struct drm_sched_job *s_job;
>>> + struct drm_sched_entity *s_entity = NULL;
>>> + int i;
>>> +
>>> + /* Signal all jobs not yet scheduled */
>>> + for (i = DRM_SCHED_PRIORITY_MAX - 1; i >=
>>> DRM_SCHED_PRIORITY_MIN; i--) {
>>> + struct drm_sched_rq *rq = &sched->sched_rq[i];
>>> +
>>> + if (!rq)
>>> + continue;
>>> +
>>> + spin_lock(&rq->lock);
>>> + list_for_each_entry(s_entity, &rq->entities, list) {
>>> + while ((s_job =
>>> to_drm_sched_job(spsc_queue_pop(&s_entity->job_queue)))) {
>>> + struct drm_sched_fence *s_fence = s_job->s_fence;
>>> +
>>> + dma_fence_signal(&s_fence->scheduled);
>>> + dma_fence_set_error(&s_fence->finished, -EHWPOISON);
>>> + dma_fence_signal(&s_fence->finished);
>>> + }
>>> + }
>>> + spin_unlock(&rq->lock);
>>> + }
>>> +
>>> + /* Signal all jobs already scheduled to HW */
>>> + list_for_each_entry(s_job, &sched->ring_mirror_list, node) {
>>> + struct drm_sched_fence *s_fence = s_job->s_fence;
>>> +
>>> + dma_fence_set_error(&s_fence->finished, -EHWPOISON);
>>> + dma_fence_signal(&s_fence->finished);
>>> + }
>>> +}
>> That might be better put into amdgpu_job.c.
>>
>> And I assume this is called only during GPU reset with the scheduler
>> fully stopped?
>
> Yes
>
>
>>> +
>>> /**
>>> * amdgpu_device_gpu_recover - reset the asic and recover scheduler
>>> *
>>> @@ -3765,11 +3803,12 @@ int amdgpu_device_gpu_recover(struct
>>> amdgpu_device *adev,
>>> struct amdgpu_hive_info *hive = NULL;
>>> struct amdgpu_device *tmp_adev = NULL;
>>> int i, r = 0;
>>> + bool in_ras_intr = amdgpu_ras_intr_triggered();
>>> need_full_reset = job_signaled = false;
>>> INIT_LIST_HEAD(&device_list);
>>> - dev_info(adev->dev, "GPU reset begin!\n");
>>> + dev_info(adev->dev, "GPU %s begin!\n", in_ras_intr ? "jobs
>>> stop":"reset");
>>> cancel_delayed_work_sync(&adev->delayed_init_work);
>>> @@ -3799,7 +3838,7 @@ int amdgpu_device_gpu_recover(struct
>>> amdgpu_device *adev,
>>> /* Build list of devices to reset */
>>> if (adev->gmc.xgmi.num_physical_nodes > 1) {
>>> if (!hive) {
>>> - amdgpu_device_unlock_adev(adev);
>>> + amdgpu_device_unlock_adev(adev, false);
>>> return -ENODEV;
>>> }
>>> @@ -3824,7 +3863,7 @@ int amdgpu_device_gpu_recover(struct
>>> amdgpu_device *adev,
>>> /* block all schedulers and reset given job's ring */
>>> list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>>> /* disable ras on ALL IPs */
>>> - if (amdgpu_device_ip_need_full_reset(tmp_adev))
>>> + if (!in_ras_intr && amdgpu_device_ip_need_full_reset(tmp_adev))
>>> amdgpu_ras_suspend(tmp_adev);
>>> for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>> @@ -3834,10 +3873,16 @@ int amdgpu_device_gpu_recover(struct
>>> amdgpu_device *adev,
>>> continue;
>>> drm_sched_stop(&ring->sched, job ? &job->base : NULL);
>>> +
>>> + if (in_ras_intr)
>>> + amdgpu_stop_all_jobs_on_sched(&ring->sched);
>>> }
>>> }
>>> + if (in_ras_intr)
>>> + goto skip_hw_reset;
>>> +
>>> /*
>>> * Must check guilty signal here since after this point all old
>>> * HW fences are force signaled.
>>> @@ -3902,34 +3947,37 @@ int amdgpu_device_gpu_recover(struct
>>> amdgpu_device *adev,
>>> /* Post ASIC reset for all devs .*/
>>> list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>>> - for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>> - struct amdgpu_ring *ring = tmp_adev->rings[i];
>>> - if (!ring || !ring->sched.thread)
>>> - continue;
>>> + if (!in_ras_intr) {
>> Maybe write it like this:
>>
>> if (in_ras_intr) {
>> amdgpu_device_unlock_adev(..)
>> continue;
>> }
>>
>> Or even better use a completely separate unlock loop.
>>
>> Christian.
>>
>>> + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>> + struct amdgpu_ring *ring = tmp_adev->rings[i];
>>> - /* No point to resubmit jobs if we didn't HW reset*/
>>> - if (!tmp_adev->asic_reset_res && !job_signaled)
>>> - drm_sched_resubmit_jobs(&ring->sched);
>>> + if (!ring || !ring->sched.thread)
>>> + continue;
>>> - drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
>>> - }
>>> + /* No point to resubmit jobs if we didn't HW reset*/
>>> + if (!tmp_adev->asic_reset_res && !job_signaled)
>>> + drm_sched_resubmit_jobs(&ring->sched);
>>> - if (!amdgpu_device_has_dc_support(tmp_adev) &&
>>> !job_signaled) {
>>> - drm_helper_resume_force_mode(tmp_adev->ddev);
>>> - }
>>> + drm_sched_start(&ring->sched,
>>> !tmp_adev->asic_reset_res);
>>> + }
>>> - tmp_adev->asic_reset_res = 0;
>>> + if (!amdgpu_device_has_dc_support(tmp_adev) &&
>>> !job_signaled) {
>>> + drm_helper_resume_force_mode(tmp_adev->ddev);
>>> + }
>>> - if (r) {
>>> - /* bad news, how to tell it to userspace ? */
>>> - dev_info(tmp_adev->dev, "GPU reset(%d) failed\n",
>>> atomic_read(&adev->gpu_reset_counter));
>>> - amdgpu_vf_error_put(tmp_adev,
>>> AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
>>> - } else {
>>> - dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n",
>>> atomic_read(&adev->gpu_reset_counter));
>>> + tmp_adev->asic_reset_res = 0;
>>> +
>>> + if (r) {
>>> + /* bad news, how to tell it to userspace ? */
>>> + dev_info(tmp_adev->dev, "GPU reset(%d) failed\n",
>>> atomic_read(&adev->gpu_reset_counter));
>>> + amdgpu_vf_error_put(tmp_adev,
>>> AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
>>> + } else {
>>> + dev_info(tmp_adev->dev, "GPU reset(%d)
>>> succeeded!\n", atomic_read(&adev->gpu_reset_counter));
>>> + }
>>> }
>>> - amdgpu_device_unlock_adev(tmp_adev);
>>> + amdgpu_device_unlock_adev(tmp_adev, in_ras_intr);
>>> }
>>> if (hive)
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> index 151d7f2..757fd6d 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> @@ -40,6 +40,8 @@
>>> #include "amdgpu_amdkfd.h"
>>> +#include "amdgpu_ras.h"
>>> +
>>> /*
>>> * KMS wrapper.
>>> * - 3.0.0 - initial driver
>>> @@ -1179,6 +1181,9 @@ amdgpu_pci_shutdown(struct pci_dev *pdev)
>>> struct drm_device *dev = pci_get_drvdata(pdev);
>>> struct amdgpu_device *adev = dev->dev_private;
>>> + if (amdgpu_ras_intr_triggered())
>>> + return;
>>> +
>>> /* if we are running in a VM, make sure the device
>>> * torn down properly on reboot/shutdown.
>>> * unfortunately we can't detect certain
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>>> index da2143d..ced766c 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>>> @@ -1046,6 +1046,12 @@ int amdgpu_driver_open_kms(struct drm_device
>>> *dev, struct drm_file *file_priv)
>>> /* Ensure IB tests are run on ring */
>>> flush_delayed_work(&adev->delayed_init_work);
>>> +
>>> + if (amdgpu_ras_intr_triggered()) {
>>> + DRM_ERROR("RAS Intr triggered, device disabled!!");
>>> + return -EHWPOISON;
>>> + }
>>> +
>>> file_priv->driver_priv = NULL;
>>> r = pm_runtime_get_sync(dev->dev);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>>> index 2d5897a..086e6df 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>>> @@ -24,6 +24,8 @@
>>> #include <linux/debugfs.h>
>>> #include <linux/list.h>
>>> #include <linux/module.h>
>>> +#include <linux/reboot.h>
>>> +#include <linux/syscalls.h>
>>> #include "amdgpu.h"
>>> #include "amdgpu_ras.h"
>>> #include "amdgpu_atomfirmware.h"
>>> @@ -64,6 +66,9 @@ const char *ras_block_string[] = {
>>> /* inject address is 52 bits */
>>> #define RAS_UMC_INJECT_ADDR_LIMIT (0x1ULL << 52)
>>> +
>>> +atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0);
>>> +
>>> static int amdgpu_ras_reserve_vram(struct amdgpu_device *adev,
>>> uint64_t offset, uint64_t size,
>>> struct amdgpu_bo **bo_ptr);
>>> @@ -80,7 +85,7 @@ static ssize_t amdgpu_ras_debugfs_read(struct file
>>> *f, char __user *buf,
>>> ssize_t s;
>>> char val[128];
>>> - if (amdgpu_ras_error_query(obj->adev, &info))
>>> + if (amdgpu_ras_error_query(obj->adev, &info, false))
>>> return -EINVAL;
>>> s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n",
>>> @@ -188,6 +193,10 @@ static int
>>> amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
>>> return 0;
>>> }
>>> +
>>> +static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device
>>> *adev,
>>> + struct ras_common_if *head);
>>> +
>>> /**
>>> * DOC: AMDGPU RAS debugfs control interface
>>> *
>>> @@ -304,7 +313,7 @@ static ssize_t amdgpu_ras_sysfs_read(struct
>>> device *dev,
>>> .head = obj->head,
>>> };
>>> - if (amdgpu_ras_error_query(obj->adev, &info))
>>> + if (amdgpu_ras_error_query(obj->adev, &info, false))
>>> return -EINVAL;
>>> return snprintf(buf, PAGE_SIZE, "%s: %lu\n%s: %lu\n",
>>> @@ -591,7 +600,7 @@ static int amdgpu_ras_enable_all_features(struct
>>> amdgpu_device *adev,
>>> /* query/inject/cure begin */
>>> int amdgpu_ras_error_query(struct amdgpu_device *adev,
>>> - struct ras_query_if *info)
>>> + struct ras_query_if *info, bool print)
>>> {
>>> struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
>>> struct ras_err_data err_data = {0, 0, 0, NULL};
>>> @@ -627,12 +636,14 @@ int amdgpu_ras_error_query(struct amdgpu_device
>>> *adev,
>>> info->ue_count = obj->err_data.ue_count;
>>> info->ce_count = obj->err_data.ce_count;
>>> - if (err_data.ce_count)
>>> + if (err_data.ce_count || print) {
>>> dev_info(adev->dev, "%ld correctable errors detected in %s
>>> block\n",
>>> obj->err_data.ce_count, ras_block_str(info->head.block));
>>> - if (err_data.ue_count)
>>> + }
>>> + if (err_data.ue_count || print) {
>>> dev_info(adev->dev, "%ld uncorrectable errors detected in
>>> %s block\n",
>>> obj->err_data.ue_count, ras_block_str(info->head.block));
>>> + }
>>> return 0;
>>> }
>>> @@ -702,7 +713,7 @@ int amdgpu_ras_query_error_count(struct
>>> amdgpu_device *adev,
>>> .head = obj->head,
>>> };
>>> - if (amdgpu_ras_error_query(adev, &info))
>>> + if (amdgpu_ras_error_query(adev, &info, true))
>>> return -EINVAL;
>>> data.ce_count += info.ce_count;
>>> @@ -1718,3 +1729,10 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
>>> return 0;
>>> }
>>> +
>>> +void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
>>> +{
>>> + if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
>>> + DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT
>>> detected! Stopping all GPU jobs.\n");
>>> + }
>>> +}
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
>>> index 5a0df73..c0e22af 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
>>> @@ -587,7 +587,7 @@ void amdgpu_ras_debugfs_remove(struct
>>> amdgpu_device *adev,
>>> struct ras_common_if *head);
>>> int amdgpu_ras_error_query(struct amdgpu_device *adev,
>>> - struct ras_query_if *info);
>>> + struct ras_query_if *info, bool print);
>>> int amdgpu_ras_error_inject(struct amdgpu_device *adev,
>>> struct ras_inject_if *info);
>>> @@ -600,4 +600,14 @@ int amdgpu_ras_interrupt_remove_handler(struct
>>> amdgpu_device *adev,
>>> int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,
>>> struct ras_dispatch_if *info);
>>> +
>>> +extern atomic_t amdgpu_ras_in_intr;
>>> +
>>> +static inline bool amdgpu_ras_intr_triggered(void)
>>> +{
>>> + return !!atomic_read(&amdgpu_ras_in_intr);
>>> +}
>>> +
>>> +void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev);
>>> +
>>> #endif
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> index b2c86a0..e7a83f6 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> @@ -5669,10 +5669,12 @@ static int
>>> gfx_v9_0_process_ras_data_cb(struct amdgpu_device *adev,
>>> struct amdgpu_iv_entry *entry)
>>> {
>>> /* TODO ue will trigger an interrupt. */
>>> - kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
>>> - if (adev->gfx.funcs->query_ras_error_count)
>>> - adev->gfx.funcs->query_ras_error_count(adev, err_data);
>>> - amdgpu_ras_reset_gpu(adev, 0);
>>> + if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {
>>> + kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
>>> + if (adev->gfx.funcs->query_ras_error_count)
>>> + adev->gfx.funcs->query_ras_error_count(adev, err_data);
>>> + amdgpu_ras_reset_gpu(adev, 0);
>>> + }
>>> return AMDGPU_RAS_SUCCESS;
>>> }
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> index 43b4fbc..87a66c2 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> @@ -243,18 +243,20 @@ static int gmc_v9_0_process_ras_data_cb(struct
>>> amdgpu_device *adev,
>>> struct ras_err_data *err_data,
>>> struct amdgpu_iv_entry *entry)
>>> {
>>> - kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
>>> - if (adev->umc.funcs->query_ras_error_count)
>>> - adev->umc.funcs->query_ras_error_count(adev, err_data);
>>> - /* umc query_ras_error_address is also responsible for clearing
>>> - * error status
>>> - */
>>> - if (adev->umc.funcs->query_ras_error_address)
>>> - adev->umc.funcs->query_ras_error_address(adev, err_data);
>>> + if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {
>>> + kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
>>> + if (adev->umc.funcs->query_ras_error_count)
>>> + adev->umc.funcs->query_ras_error_count(adev, err_data);
>>> + /* umc query_ras_error_address is also responsible for clearing
>>> + * error status
>>> + */
>>> + if (adev->umc.funcs->query_ras_error_address)
>>> + adev->umc.funcs->query_ras_error_address(adev, err_data);
>>> - /* only uncorrectable error needs gpu reset */
>>> - if (err_data->ue_count)
>>> - amdgpu_ras_reset_gpu(adev, 0);
>>> + /* only uncorrectable error needs gpu reset */
>>> + if (err_data->ue_count)
>>> + amdgpu_ras_reset_gpu(adev, 0);
>>> + }
>>> return AMDGPU_RAS_SUCCESS;
>>> }
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
>>> b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
>>> index 367f9d6..545990c 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
>>> @@ -30,6 +30,7 @@
>>> #include "nbio/nbio_7_4_0_smn.h"
>>> #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
>>> #include <uapi/linux/kfd_ioctl.h>
>>> +#include "amdgpu_ras.h"
>>> #define smnNBIF_MGCG_CTRL_LCLK 0x1013a21c
>>> @@ -329,6 +330,8 @@ static void
>>> nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device
>>> BIF_DOORBELL_INT_CNTL,
>>> RAS_CNTLR_INTERRUPT_CLEAR, 1);
>>> WREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL,
>>> bif_doorbell_intr_cntl);
>>> +
>>> + amdgpu_ras_global_ras_isr(adev);
>>> }
>>> }
>>> @@ -344,6 +347,8 @@ static void
>>> nbio_v7_4_handle_ras_err_event_athub_intr_no_bifring(struct amdgpu_d
>>> BIF_DOORBELL_INT_CNTL,
>>> RAS_ATHUB_ERR_EVENT_INTERRUPT_CLEAR, 1);
>>> WREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL,
>>> bif_doorbell_intr_cntl);
>>> +
>>> + amdgpu_ras_global_ras_isr(adev);
>>> }
>>> }
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>>> index 956432f..438e504 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>>> @@ -1972,24 +1972,26 @@ static int
>>> sdma_v4_0_process_ras_data_cb(struct amdgpu_device *adev,
>>> uint32_t err_source;
>>> int instance;
>>> - instance = sdma_v4_0_irq_id_to_seq(entry->client_id);
>>> - if (instance < 0)
>>> - return 0;
>>> + if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {
>>> + instance = sdma_v4_0_irq_id_to_seq(entry->client_id);
>>> + if (instance < 0)
>>> + return 0;
>>> - switch (entry->src_id) {
>>> - case SDMA0_4_0__SRCID__SDMA_SRAM_ECC:
>>> - err_source = 0;
>>> - break;
>>> - case SDMA0_4_0__SRCID__SDMA_ECC:
>>> - err_source = 1;
>>> - break;
>>> - default:
>>> - return 0;
>>> - }
>>> + switch (entry->src_id) {
>>> + case SDMA0_4_0__SRCID__SDMA_SRAM_ECC:
>>> + err_source = 0;
>>> + break;
>>> + case SDMA0_4_0__SRCID__SDMA_ECC:
>>> + err_source = 1;
>>> + break;
>>> + default:
>>> + return 0;
>>> + }
>>> - kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
>>> + kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
>>> - amdgpu_ras_reset_gpu(adev, 0);
>>> + amdgpu_ras_reset_gpu(adev, 0);
>>> + }
>>> return AMDGPU_RAS_SUCCESS;
>>> }
More information about the amd-gfx
mailing list