[PATCH 1/2] drm/amdgpu: Avoid HW GPU reset for RAS.

Koenig, Christian Christian.Koenig at amd.com
Thu Aug 29 14:06:20 UTC 2019


Am 29.08.19 um 16:03 schrieb Grodzovsky, Andrey:
> On 8/29/19 3:30 AM, Christian König wrote:
>> Am 28.08.19 um 22:00 schrieb Andrey Grodzovsky:
>>> Problem:
>>> Under certain conditions, when some IP blocks take a RAS error,
>>> we can get into a situation where a GPU reset is not possible
>>> due to issues in RAS in SMU/PSP.
>>>
>>> Temporary fix until proper solution in PSP/SMU is ready:
>>> When uncorrectable error happens the DF will unconditionally
>>> broadcast error event packets to all its clients/slave upon
>>> receiving fatal error event and freeze all its outbound queues,
>>> err_event_athub interrupt  will be triggered.
>>> In such a case we use this interrupt
>>> to issue a GPU reset. The GPU reset code is modified for such a case to
>>> avoid HW
>>> reset; it only stops schedulers, detaches all in-progress and not yet
>>> scheduled
>>> jobs' fences, sets an error code on them and signals them.
>>> Also reject any new incoming job submissions from user space.
>>> All this is done to notify the applications of the problem.
>>>
>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c     |  4 ++
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 98
>>> ++++++++++++++++++++++--------
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  5 ++
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c    |  6 ++
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c    | 30 +++++++--
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h    | 12 +++-
>>>    drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c      | 10 +--
>>>    drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c      | 24 ++++----
>>>    drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c     |  5 ++
>>>    drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c     | 32 +++++-----
>>>    10 files changed, 164 insertions(+), 62 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> index 9da681e..300adb8 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> @@ -38,6 +38,7 @@
>>>    #include "amdgpu_gmc.h"
>>>    #include "amdgpu_gem.h"
>>>    #include "amdgpu_display.h"
>>> +#include "amdgpu_ras.h"
>>>      #if defined(HAVE_DRM_FREE_LARGE)
>>>    #define kvfree drm_free_large
>>> @@ -1461,6 +1462,9 @@ int amdgpu_cs_ioctl(struct drm_device *dev,
>>> void *data, struct drm_file *filp)
>>>        bool reserved_buffers = false;
>>>        int i, r;
>>>    +    if (amdgpu_ras_intr_triggered())
>>> +        return -EHWPOISON;
>>> +
>>>        if (!adev->accel_working)
>>>            return -EBUSY;
>>>    diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index 07a4ba0..3ecee10 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -3734,10 +3734,10 @@ static bool amdgpu_device_lock_adev(struct
>>> amdgpu_device *adev, bool trylock)
>>>        return true;
>>>    }
>>>    -static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
>>> +static void amdgpu_device_unlock_adev(struct amdgpu_device *adev,
>>> bool skip_kfd)
>>>    {
>>>        /*unlock kfd: SRIOV would do it separately */
>>> -    if (!amdgpu_sriov_vf(adev))
>>> +    if (!amdgpu_sriov_vf(adev) && !skip_kfd)
>>>                    amdgpu_amdkfd_post_reset(adev);
>> It's most likely better to completely remove the call to
>> amdgpu_amdkfd_post_reset() here.
>
> Felix advised that the way to stop all KFD activity is simply to NOT
> call amdgpu_amdkfd_post_reset so that's why I added this. Do you mean you
> prefer amdgpu_amdkfd_post_reset to be outside of amdgpu_device_unlock_adev?

Yes, exactly. It doesn't seem to be related to the unlock operation in 
the first place, but rather only signals the KFD that the reset has 
completed.

Christian.

>
>
>>>        amdgpu_vf_error_trans_all(adev);
>>>        adev->mp1_state = PP_MP1_STATE_NONE;
>>> @@ -3746,6 +3746,44 @@ static void amdgpu_device_unlock_adev(struct
>>> amdgpu_device *adev)
>>>    }
>>>      +#define to_drm_sched_job(sched_job)        \
>>> +        container_of((sched_job), struct drm_sched_job, queue_node)
>>> +
>>> +static void amdgpu_stop_all_jobs_on_sched(struct drm_gpu_scheduler
>>> *sched)
>>> +{
>>> +    struct drm_sched_job *s_job;
>>> +    struct drm_sched_entity *s_entity = NULL;
>>> +    int i;
>>> +
>>> +    /* Signal all jobs not yet scheduled */
>>> +    for (i = DRM_SCHED_PRIORITY_MAX - 1; i >=
>>> DRM_SCHED_PRIORITY_MIN; i--) {
>>> +        struct drm_sched_rq *rq = &sched->sched_rq[i];
>>> +
>>> +        if (!rq)
>>> +            continue;
>>> +
>>> +        spin_lock(&rq->lock);
>>> +        list_for_each_entry(s_entity, &rq->entities, list) {
>>> +            while ((s_job =
>>> to_drm_sched_job(spsc_queue_pop(&s_entity->job_queue)))) {
>>> +                struct drm_sched_fence *s_fence = s_job->s_fence;
>>> +
>>> +                dma_fence_signal(&s_fence->scheduled);
>>> +                dma_fence_set_error(&s_fence->finished, -EHWPOISON);
>>> +                dma_fence_signal(&s_fence->finished);
>>> +            }
>>> +        }
>>> +        spin_unlock(&rq->lock);
>>> +    }
>>> +
>>> +    /* Signal all jobs already scheduled to HW */
>>> +    list_for_each_entry(s_job, &sched->ring_mirror_list, node) {
>>> +        struct drm_sched_fence *s_fence = s_job->s_fence;
>>> +
>>> +        dma_fence_set_error(&s_fence->finished, -EHWPOISON);
>>> +        dma_fence_signal(&s_fence->finished);
>>> +    }
>>> +}
>> That might be better put into amdgpu_job.c.
>>
>> And I assume this is called only during GPU reset with the scheduler
>> fully stopped?
>
> Yes
>
>
>>> +
>>>    /**
>>>     * amdgpu_device_gpu_recover - reset the asic and recover scheduler
>>>     *
>>> @@ -3765,11 +3803,12 @@ int amdgpu_device_gpu_recover(struct
>>> amdgpu_device *adev,
>>>        struct amdgpu_hive_info *hive = NULL;
>>>        struct amdgpu_device *tmp_adev = NULL;
>>>        int i, r = 0;
>>> +    bool in_ras_intr = amdgpu_ras_intr_triggered();
>>>          need_full_reset = job_signaled = false;
>>>        INIT_LIST_HEAD(&device_list);
>>>    -    dev_info(adev->dev, "GPU reset begin!\n");
>>> +    dev_info(adev->dev, "GPU %s begin!\n", in_ras_intr ? "jobs
>>> stop":"reset");
>>> cancel_delayed_work_sync(&adev->delayed_init_work);
>>>    @@ -3799,7 +3838,7 @@ int amdgpu_device_gpu_recover(struct
>>> amdgpu_device *adev,
>>>        /* Build list of devices to reset */
>>>        if  (adev->gmc.xgmi.num_physical_nodes > 1) {
>>>            if (!hive) {
>>> -            amdgpu_device_unlock_adev(adev);
>>> +            amdgpu_device_unlock_adev(adev, false);
>>>                return -ENODEV;
>>>            }
>>>    @@ -3824,7 +3863,7 @@ int amdgpu_device_gpu_recover(struct
>>> amdgpu_device *adev,
>>>        /* block all schedulers and reset given job's ring */
>>>        list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>>>            /* disable ras on ALL IPs */
>>> -        if (amdgpu_device_ip_need_full_reset(tmp_adev))
>>> +        if (!in_ras_intr && amdgpu_device_ip_need_full_reset(tmp_adev))
>>>                amdgpu_ras_suspend(tmp_adev);
>>>              for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>> @@ -3834,10 +3873,16 @@ int amdgpu_device_gpu_recover(struct
>>> amdgpu_device *adev,
>>>                    continue;
>>>                  drm_sched_stop(&ring->sched, job ? &job->base : NULL);
>>> +
>>> +            if (in_ras_intr)
>>> + amdgpu_stop_all_jobs_on_sched(&ring->sched);
>>>            }
>>>        }
>>>      +    if (in_ras_intr)
>>> +        goto skip_hw_reset;
>>> +
>>>        /*
>>>         * Must check guilty signal here since after this point all old
>>>         * HW fences are force signaled.
>>> @@ -3902,34 +3947,37 @@ int amdgpu_device_gpu_recover(struct
>>> amdgpu_device *adev,
>>>          /* Post ASIC reset for all devs .*/
>>>        list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>>> -        for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>> -            struct amdgpu_ring *ring = tmp_adev->rings[i];
>>>    -            if (!ring || !ring->sched.thread)
>>> -                continue;
>>> +        if (!in_ras_intr) {
>> Maybe write it like this:
>>
>> if (in_ras_intr) {
>>      amdgpu_device_unlock_adev(..)
>>      continue;
>> }
>>
>> Or even better use a completely separate unlock loop.
>>
>> Christian.
>>
>>> +            for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>> +                struct amdgpu_ring *ring = tmp_adev->rings[i];
>>>    -            /* No point to resubmit jobs if we didn't HW reset*/
>>> -            if (!tmp_adev->asic_reset_res && !job_signaled)
>>> -                drm_sched_resubmit_jobs(&ring->sched);
>>> +                if (!ring || !ring->sched.thread)
>>> +                    continue;
>>>    -            drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
>>> -        }
>>> +                /* No point to resubmit jobs if we didn't HW reset*/
>>> +                if (!tmp_adev->asic_reset_res && !job_signaled)
>>> + drm_sched_resubmit_jobs(&ring->sched);
>>>    -        if (!amdgpu_device_has_dc_support(tmp_adev) &&
>>> !job_signaled) {
>>> -            drm_helper_resume_force_mode(tmp_adev->ddev);
>>> -        }
>>> +                drm_sched_start(&ring->sched,
>>> !tmp_adev->asic_reset_res);
>>> +            }
>>>    -        tmp_adev->asic_reset_res = 0;
>>> +            if (!amdgpu_device_has_dc_support(tmp_adev) &&
>>> !job_signaled) {
>>> + drm_helper_resume_force_mode(tmp_adev->ddev);
>>> +            }
>>>    -        if (r) {
>>> -            /* bad news, how to tell it to userspace ? */
>>> -            dev_info(tmp_adev->dev, "GPU reset(%d) failed\n",
>>> atomic_read(&adev->gpu_reset_counter));
>>> -            amdgpu_vf_error_put(tmp_adev,
>>> AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
>>> -        } else {
>>> -            dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n",
>>> atomic_read(&adev->gpu_reset_counter));
>>> +            tmp_adev->asic_reset_res = 0;
>>> +
>>> +            if (r) {
>>> +                /* bad news, how to tell it to userspace ? */
>>> +                dev_info(tmp_adev->dev, "GPU reset(%d) failed\n",
>>> atomic_read(&adev->gpu_reset_counter));
>>> +                amdgpu_vf_error_put(tmp_adev,
>>> AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
>>> +            } else {
>>> +                dev_info(tmp_adev->dev, "GPU reset(%d)
>>> succeeded!\n", atomic_read(&adev->gpu_reset_counter));
>>> +            }
>>>            }
>>>    -        amdgpu_device_unlock_adev(tmp_adev);
>>> +        amdgpu_device_unlock_adev(tmp_adev, in_ras_intr);
>>>        }
>>>          if (hive)
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> index 151d7f2..757fd6d 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> @@ -40,6 +40,8 @@
>>>      #include "amdgpu_amdkfd.h"
>>>    +#include "amdgpu_ras.h"
>>> +
>>>    /*
>>>     * KMS wrapper.
>>>     * - 3.0.0 - initial driver
>>> @@ -1179,6 +1181,9 @@ amdgpu_pci_shutdown(struct pci_dev *pdev)
>>>        struct drm_device *dev = pci_get_drvdata(pdev);
>>>        struct amdgpu_device *adev = dev->dev_private;
>>>    +    if (amdgpu_ras_intr_triggered())
>>> +        return;
>>> +
>>>        /* if we are running in a VM, make sure the device
>>>         * torn down properly on reboot/shutdown.
>>>         * unfortunately we can't detect certain
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>>> index da2143d..ced766c 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>>> @@ -1046,6 +1046,12 @@ int amdgpu_driver_open_kms(struct drm_device
>>> *dev, struct drm_file *file_priv)
>>>        /* Ensure IB tests are run on ring */
>>>        flush_delayed_work(&adev->delayed_init_work);
>>>    +
>>> +    if (amdgpu_ras_intr_triggered()) {
>>> +        DRM_ERROR("RAS Intr triggered, device disabled!!");
>>> +        return -EHWPOISON;
>>> +    }
>>> +
>>>        file_priv->driver_priv = NULL;
>>>          r = pm_runtime_get_sync(dev->dev);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>>> index 2d5897a..086e6df 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>>> @@ -24,6 +24,8 @@
>>>    #include <linux/debugfs.h>
>>>    #include <linux/list.h>
>>>    #include <linux/module.h>
>>> +#include <linux/reboot.h>
>>> +#include <linux/syscalls.h>
>>>    #include "amdgpu.h"
>>>    #include "amdgpu_ras.h"
>>>    #include "amdgpu_atomfirmware.h"
>>> @@ -64,6 +66,9 @@ const char *ras_block_string[] = {
>>>    /* inject address is 52 bits */
>>>    #define    RAS_UMC_INJECT_ADDR_LIMIT    (0x1ULL << 52)
>>>    +
>>> +atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0);
>>> +
>>>    static int amdgpu_ras_reserve_vram(struct amdgpu_device *adev,
>>>            uint64_t offset, uint64_t size,
>>>            struct amdgpu_bo **bo_ptr);
>>> @@ -80,7 +85,7 @@ static ssize_t amdgpu_ras_debugfs_read(struct file
>>> *f, char __user *buf,
>>>        ssize_t s;
>>>        char val[128];
>>>    -    if (amdgpu_ras_error_query(obj->adev, &info))
>>> +    if (amdgpu_ras_error_query(obj->adev, &info, false))
>>>            return -EINVAL;
>>>          s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n",
>>> @@ -188,6 +193,10 @@ static int
>>> amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
>>>          return 0;
>>>    }
>>> +
>>> +static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device
>>> *adev,
>>> +        struct ras_common_if *head);
>>> +
>>>    /**
>>>     * DOC: AMDGPU RAS debugfs control interface
>>>     *
>>> @@ -304,7 +313,7 @@ static ssize_t amdgpu_ras_sysfs_read(struct
>>> device *dev,
>>>            .head = obj->head,
>>>        };
>>>    -    if (amdgpu_ras_error_query(obj->adev, &info))
>>> +    if (amdgpu_ras_error_query(obj->adev, &info, false))
>>>            return -EINVAL;
>>>          return snprintf(buf, PAGE_SIZE, "%s: %lu\n%s: %lu\n",
>>> @@ -591,7 +600,7 @@ static int amdgpu_ras_enable_all_features(struct
>>> amdgpu_device *adev,
>>>      /* query/inject/cure begin */
>>>    int amdgpu_ras_error_query(struct amdgpu_device *adev,
>>> -        struct ras_query_if *info)
>>> +        struct ras_query_if *info, bool print)
>>>    {
>>>        struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
>>>        struct ras_err_data err_data = {0, 0, 0, NULL};
>>> @@ -627,12 +636,14 @@ int amdgpu_ras_error_query(struct amdgpu_device
>>> *adev,
>>>        info->ue_count = obj->err_data.ue_count;
>>>        info->ce_count = obj->err_data.ce_count;
>>>    -    if (err_data.ce_count)
>>> +    if (err_data.ce_count || print) {
>>>            dev_info(adev->dev, "%ld correctable errors detected in %s
>>> block\n",
>>>                 obj->err_data.ce_count, ras_block_str(info->head.block));
>>> -    if (err_data.ue_count)
>>> +    }
>>> +    if (err_data.ue_count || print) {
>>>            dev_info(adev->dev, "%ld uncorrectable errors detected in
>>> %s block\n",
>>>                 obj->err_data.ue_count, ras_block_str(info->head.block));
>>> +    }
>>>          return 0;
>>>    }
>>> @@ -702,7 +713,7 @@ int amdgpu_ras_query_error_count(struct
>>> amdgpu_device *adev,
>>>                .head = obj->head,
>>>            };
>>>    -        if (amdgpu_ras_error_query(adev, &info))
>>> +        if (amdgpu_ras_error_query(adev, &info, true))
>>>                return -EINVAL;
>>>              data.ce_count += info.ce_count;
>>> @@ -1718,3 +1729,10 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
>>>          return 0;
>>>    }
>>> +
>>> +void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
>>> +{
>>> +    if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
>>> +        DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT
>>> detected! Stopping all GPU jobs.\n");
>>> +    }
>>> +}
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
>>> index 5a0df73..c0e22af 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
>>> @@ -587,7 +587,7 @@ void amdgpu_ras_debugfs_remove(struct
>>> amdgpu_device *adev,
>>>            struct ras_common_if *head);
>>>      int amdgpu_ras_error_query(struct amdgpu_device *adev,
>>> -        struct ras_query_if *info);
>>> +        struct ras_query_if *info, bool print);
>>>      int amdgpu_ras_error_inject(struct amdgpu_device *adev,
>>>            struct ras_inject_if *info);
>>> @@ -600,4 +600,14 @@ int amdgpu_ras_interrupt_remove_handler(struct
>>> amdgpu_device *adev,
>>>      int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,
>>>            struct ras_dispatch_if *info);
>>> +
>>> +extern atomic_t amdgpu_ras_in_intr;
>>> +
>>> +static inline bool amdgpu_ras_intr_triggered(void)
>>> +{
>>> +    return !!atomic_read(&amdgpu_ras_in_intr);
>>> +}
>>> +
>>> +void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev);
>>> +
>>>    #endif
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> index b2c86a0..e7a83f6 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> @@ -5669,10 +5669,12 @@ static int
>>> gfx_v9_0_process_ras_data_cb(struct amdgpu_device *adev,
>>>            struct amdgpu_iv_entry *entry)
>>>    {
>>>        /* TODO ue will trigger an interrupt. */
>>> -    kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
>>> -    if (adev->gfx.funcs->query_ras_error_count)
>>> -        adev->gfx.funcs->query_ras_error_count(adev, err_data);
>>> -    amdgpu_ras_reset_gpu(adev, 0);
>>> +    if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {
>>> +        kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
>>> +        if (adev->gfx.funcs->query_ras_error_count)
>>> +            adev->gfx.funcs->query_ras_error_count(adev, err_data);
>>> +        amdgpu_ras_reset_gpu(adev, 0);
>>> +    }
>>>        return AMDGPU_RAS_SUCCESS;
>>>    }
>>>    diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> index 43b4fbc..87a66c2 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> @@ -243,18 +243,20 @@ static int gmc_v9_0_process_ras_data_cb(struct
>>> amdgpu_device *adev,
>>>            struct ras_err_data *err_data,
>>>            struct amdgpu_iv_entry *entry)
>>>    {
>>> -    kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
>>> -    if (adev->umc.funcs->query_ras_error_count)
>>> -        adev->umc.funcs->query_ras_error_count(adev, err_data);
>>> -    /* umc query_ras_error_address is also responsible for clearing
>>> -     * error status
>>> -     */
>>> -    if (adev->umc.funcs->query_ras_error_address)
>>> -        adev->umc.funcs->query_ras_error_address(adev, err_data);
>>> +    if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {
>>> +        kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
>>> +        if (adev->umc.funcs->query_ras_error_count)
>>> +            adev->umc.funcs->query_ras_error_count(adev, err_data);
>>> +        /* umc query_ras_error_address is also responsible for clearing
>>> +         * error status
>>> +         */
>>> +        if (adev->umc.funcs->query_ras_error_address)
>>> + adev->umc.funcs->query_ras_error_address(adev, err_data);
>>>    -    /* only uncorrectable error needs gpu reset */
>>> -    if (err_data->ue_count)
>>> -        amdgpu_ras_reset_gpu(adev, 0);
>>> +        /* only uncorrectable error needs gpu reset */
>>> +        if (err_data->ue_count)
>>> +            amdgpu_ras_reset_gpu(adev, 0);
>>> +    }
>>>          return AMDGPU_RAS_SUCCESS;
>>>    }
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
>>> b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
>>> index 367f9d6..545990c 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
>>> @@ -30,6 +30,7 @@
>>>    #include "nbio/nbio_7_4_0_smn.h"
>>>    #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
>>>    #include <uapi/linux/kfd_ioctl.h>
>>> +#include "amdgpu_ras.h"
>>>      #define smnNBIF_MGCG_CTRL_LCLK    0x1013a21c
>>>    @@ -329,6 +330,8 @@ static void
>>> nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device
>>>                            BIF_DOORBELL_INT_CNTL,
>>>                            RAS_CNTLR_INTERRUPT_CLEAR, 1);
>>>            WREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL,
>>> bif_doorbell_intr_cntl);
>>> +
>>> +        amdgpu_ras_global_ras_isr(adev);
>>>        }
>>>    }
>>>    @@ -344,6 +347,8 @@ static void
>>> nbio_v7_4_handle_ras_err_event_athub_intr_no_bifring(struct amdgpu_d
>>>                            BIF_DOORBELL_INT_CNTL,
>>>                            RAS_ATHUB_ERR_EVENT_INTERRUPT_CLEAR, 1);
>>>            WREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL,
>>> bif_doorbell_intr_cntl);
>>> +
>>> +        amdgpu_ras_global_ras_isr(adev);
>>>        }
>>>    }
>>>    diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>>> index 956432f..438e504 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>>> @@ -1972,24 +1972,26 @@ static int
>>> sdma_v4_0_process_ras_data_cb(struct amdgpu_device *adev,
>>>        uint32_t err_source;
>>>        int instance;
>>>    -    instance = sdma_v4_0_irq_id_to_seq(entry->client_id);
>>> -    if (instance < 0)
>>> -        return 0;
>>> +    if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {
>>> +        instance = sdma_v4_0_irq_id_to_seq(entry->client_id);
>>> +        if (instance < 0)
>>> +            return 0;
>>>    -    switch (entry->src_id) {
>>> -    case SDMA0_4_0__SRCID__SDMA_SRAM_ECC:
>>> -        err_source = 0;
>>> -        break;
>>> -    case SDMA0_4_0__SRCID__SDMA_ECC:
>>> -        err_source = 1;
>>> -        break;
>>> -    default:
>>> -        return 0;
>>> -    }
>>> +        switch (entry->src_id) {
>>> +        case SDMA0_4_0__SRCID__SDMA_SRAM_ECC:
>>> +            err_source = 0;
>>> +            break;
>>> +        case SDMA0_4_0__SRCID__SDMA_ECC:
>>> +            err_source = 1;
>>> +            break;
>>> +        default:
>>> +            return 0;
>>> +        }
>>>    -    kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
>>> +        kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
>>>    -    amdgpu_ras_reset_gpu(adev, 0);
>>> +        amdgpu_ras_reset_gpu(adev, 0);
>>> +    }
>>>          return AMDGPU_RAS_SUCCESS;
>>>    }



More information about the amd-gfx mailing list