[PATCH 3/3] drm/amdgpu: Avoid HW reset if guilty job already signaled.
Christian König
ckoenig.leichtzumerken at gmail.com
Wed Apr 10 14:06:04 UTC 2019
On 4/9/19 6:42 PM, Grodzovsky, Andrey wrote:
> On 4/9/19 10:50 AM, Christian König wrote:
>> On 4/8/19 6:08 PM, Andrey Grodzovsky wrote:
>>> Also reject TDRs if another one already running.
>>>
>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com>
>>> ---
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 94 +++++++++++++++++++++---------
>>> 1 file changed, 67 insertions(+), 27 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index aabd043..4446892 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -3327,10 +3327,12 @@ bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
>>> static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>> struct amdgpu_job *job,
>>> - bool *need_full_reset_arg)
>>> + bool *need_full_reset_arg,
>>> + bool *job_signaled)
>>> {
>>> int i, r = 0;
>>> bool need_full_reset = *need_full_reset_arg;
>>> + struct amdgpu_ring *job_ring = job ? to_amdgpu_ring(job->base.sched) : NULL;
>>> /* block all schedulers and reset given job's ring */
>>> for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>> @@ -3341,6 +3343,17 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>> drm_sched_stop(&ring->sched, &job->base);
>>> + /*
>>> + * Must check guilty signal here since after this point all old
>>> + * HW fences are force signaled.
>>> + *
>>> + * job->base holds a reference to parent fence
>>> + */
>>> + if (job_signaled && job && ring == job_ring &&
>>> + job->base.s_fence->parent &&
>>> + dma_fence_is_signaled(job->base.s_fence->parent))
>>> + *job_signaled = true;
>>> +
>> That won't work correctly. When the guilty job is not on the first
>> scheduler, you would already have force completed the fences of the
>> earlier rings before getting here.
>>
>> Better to stop all schedulers first and then do the check.
>>
>> Christian.
>
> What do you mean by first scheduler? There is one scheduler object per
> ring, so I am not clear what 'first' means here.
Well, for example if the guilty job is from a compute ring then we have
already force signaled the gfx ring here.
The same is true for the other devices in the same hive, so it would
probably be a good idea to move the force signaling and the IP reset
somewhere else and to move this check up a layer.
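
Something like this completely untested sketch, just to illustrate the
ordering I mean (the helper name and its exact placement are made up;
the calls themselves are the ones already used in the patch):

static bool amdgpu_device_stop_and_check_guilty(struct amdgpu_device *adev,
						struct amdgpu_job *job)
{
	bool signaled;
	int i;

	/* First park every scheduler so no new HW fences are emitted and
	 * nothing has been force completed yet.
	 */
	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!ring || !ring->sched.thread)
			continue;

		drm_sched_stop(&ring->sched, job ? &job->base : NULL);
	}

	/* Only now look at the guilty job's parent HW fence; since nothing
	 * was force completed, a signaled parent means the job really
	 * finished on the HW.
	 */
	signaled = job && job->base.s_fence->parent &&
		   dma_fence_is_signaled(job->base.s_fence->parent);

	/* Force completion only when we are actually going to reset. */
	if (!signaled) {
		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			amdgpu_fence_driver_force_completion(ring);
		}
	}

	return signaled;
}

For the XGMI case the stop loop would need to run for every device in
the hive before doing the check.
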
Christian.
>
> Andrey
>
>
>>> /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
>>> amdgpu_fence_driver_force_completion(ring);
>>> }
>>> @@ -3358,7 +3371,8 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>> - if (!amdgpu_sriov_vf(adev)) {
>>> + /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
>>> + if (!amdgpu_sriov_vf(adev) && !(*job_signaled)) {
>>> if (!need_full_reset)
>>> need_full_reset = amdgpu_device_ip_need_full_reset(adev);
>>> @@ -3495,7 +3509,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>> return r;
>>> }
>>> -static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev)
>>> +static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev, bool job_signaled)
>>> {
>>> int i;
>>> @@ -3505,7 +3519,8 @@ static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev)
>>> if (!ring || !ring->sched.thread)
>>> continue;
>>> - if (!adev->asic_reset_res)
>>> + /* No point to resubmit jobs if we didn't HW reset*/
>>> + if (!adev->asic_reset_res && !job_signaled)
>>> drm_sched_resubmit_jobs(&ring->sched);
>>> drm_sched_start(&ring->sched, !adev->asic_reset_res);
>>> @@ -3518,14 +3533,21 @@ static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev)
>>> adev->asic_reset_res = 0;
>>> }
>>> -static void amdgpu_device_lock_adev(struct amdgpu_device *adev)
>>> +static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock)
>>> {
>>> - mutex_lock(&adev->lock_reset);
>>> + if (trylock) {
>>> + if (!mutex_trylock(&adev->lock_reset))
>>> + return false;
>>> + } else
>>> + mutex_lock(&adev->lock_reset);
>>> +
>>> atomic_inc(&adev->gpu_reset_counter);
>>> adev->in_gpu_reset = 1;
>>> /* Block kfd: SRIOV would do it separately */
>>> if (!amdgpu_sriov_vf(adev))
>>> amdgpu_amdkfd_pre_reset(adev);
>>> +
>>> + return true;
>>> }
>>> static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
>>> @@ -3555,29 +3577,44 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>> {
>>> int r;
>>> struct amdgpu_hive_info *hive = NULL;
>>> - bool need_full_reset = false;
>>> struct amdgpu_device *tmp_adev = NULL;
>>> struct list_head device_list, *device_list_handle = NULL;
>>> + bool xgmi_topology_present, need_full_reset, job_signaled;
>>> + need_full_reset = job_signaled = false;
>>> INIT_LIST_HEAD(&device_list);
>>> dev_info(adev->dev, "GPU reset begin!\n");
>>> + hive = amdgpu_get_xgmi_hive(adev, 0);
>>> + xgmi_topology_present = hive && adev->gmc.xgmi.num_physical_nodes > 1;
>>> +
>>> /*
>>> - * In case of XGMI hive disallow concurrent resets to be triggered
>>> - * by different nodes. No point also since the one node already executing
>>> - * reset will also reset all the other nodes in the hive.
>>> + * Here we trylock to avoid chain of resets executing from
>>> + * either trigger by jobs on different adevs in XGMI hive or jobs on
>>> + * different schedulers for same device while this tdr is running.
>>> + * We always reset all schedulers for device and all devices for XGMI
>>> + * hive so that should take care of them too.
>>> */
>>> - hive = amdgpu_get_xgmi_hive(adev, 0);
>>> - if (hive && adev->gmc.xgmi.num_physical_nodes > 1 &&
>>> - !mutex_trylock(&hive->reset_lock))
>>> +
>>> + if (xgmi_topology_present && !mutex_trylock(&hive->reset_lock)) {
>>> + DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as
>>> another already in progress",
>>> + job->base.id, hive->hive_id);
>>> return 0;
>>> + }
>>> /* Start with adev pre asic reset first for soft reset check.*/
>>> - amdgpu_device_lock_adev(adev);
>>> + if (!amdgpu_device_lock_adev(adev, !xgmi_topology_present)) {
>>> + DRM_INFO("Bailing on TDR for s_job:%llx, as another already
>>> in progress",
>>> + job->base.id);
>>> + return 0;
>>> + }
>>> +
>>> + /* Guilty job will be freed after this*/
>>> r = amdgpu_device_pre_asic_reset(adev,
>>> job,
>>> - &need_full_reset);
>>> + &need_full_reset,
>>> + &job_signaled);
>>> if (r) {
>>> /*TODO Should we stop ?*/
>>> DRM_ERROR("GPU pre asic reset failed with err, %d for drm
>>> dev, %s ",
>>> @@ -3609,10 +3646,11 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>> if (tmp_adev == adev)
>>> continue;
>>> - amdgpu_device_lock_adev(tmp_adev);
>>> + amdgpu_device_lock_adev(tmp_adev, false);
>>> r = amdgpu_device_pre_asic_reset(tmp_adev,
>>> NULL,
>>> - &need_full_reset);
>>> + &need_full_reset,
>>> + &job_signaled);
>>> /*TODO Should we stop ?*/
>>> if (r) {
>>> DRM_ERROR("GPU pre asic reset failed with err, %d for
>>> drm dev, %s ",
>>> @@ -3623,19 +3661,21 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>> /* Actual ASIC resets if needed.*/
>>> /* TODO Implement XGMI hive reset logic for SRIOV */
>>> - if (amdgpu_sriov_vf(adev)) {
>>> - r = amdgpu_device_reset_sriov(adev, job ? false : true);
>>> - if (r)
>>> - adev->asic_reset_res = r;
>>> - } else {
>>> - r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset);
>>> - if (r && r == -EAGAIN)
>>> - goto retry;
>>> + if (!job || !job_signaled) {
>>> + if (amdgpu_sriov_vf(adev)) {
>>> + r = amdgpu_device_reset_sriov(adev, job ? false : true);
>>> + if (r)
>>> + adev->asic_reset_res = r;
>>> + } else {
>>> + r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset);
>>> + if (r && r == -EAGAIN)
>>> + goto retry;
>>> + }
>>> }
>>> /* Post ASIC reset for all devs .*/
>>> list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>>> - amdgpu_device_post_asic_reset(tmp_adev);
>>> + amdgpu_device_post_asic_reset(tmp_adev, job_signaled);
>>> if (r) {
>>> /* bad news, how to tell it to userspace ? */
>>> @@ -3648,7 +3688,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>> amdgpu_device_unlock_adev(tmp_adev);
>>> }
>>> - if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
>>> + if (xgmi_topology_present)
>>> mutex_unlock(&hive->reset_lock);
>>> if (r)