[PATCH 3/3] drm/amdgpu: Avoid HW reset if guilty job already signaled.
Grodzovsky, Andrey
Andrey.Grodzovsky at amd.com
Wed Apr 10 15:05:24 UTC 2019
On 4/10/19 10:41 AM, Christian König wrote:
> Am 10.04.19 um 16:28 schrieb Grodzovsky, Andrey:
>> On 4/10/19 10:06 AM, Christian König wrote:
>>> Am 09.04.19 um 18:42 schrieb Grodzovsky, Andrey:
>>>> On 4/9/19 10:50 AM, Christian König wrote:
>>>>> Am 08.04.19 um 18:08 schrieb Andrey Grodzovsky:
>>>>>> Also reject TDRs if another one already running.
>>>>>>
>>>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com>
>>>>>> ---
>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 94
>>>>>> +++++++++++++++++++++---------
>>>>>> 1 file changed, 67 insertions(+), 27 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>> index aabd043..4446892 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>> @@ -3327,10 +3327,12 @@ bool amdgpu_device_should_recover_gpu(struct
>>>>>> amdgpu_device *adev)
>>>>>> static int amdgpu_device_pre_asic_reset(struct amdgpu_device
>>>>>> *adev,
>>>>>> struct amdgpu_job *job,
>>>>>> - bool *need_full_reset_arg)
>>>>>> + bool *need_full_reset_arg,
>>>>>> + bool *job_signaled)
>>>>>> {
>>>>>> int i, r = 0;
>>>>>> bool need_full_reset = *need_full_reset_arg;
>>>>>> + struct amdgpu_ring *job_ring = job ?
>>>>>> to_amdgpu_ring(job->base.sched) : NULL;
>>>>>> /* block all schedulers and reset given job's ring */
>>>>>> for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>>>>> @@ -3341,6 +3343,17 @@ static int
>>>>>> amdgpu_device_pre_asic_reset(struct
>>>>>> amdgpu_device *adev,
>>>>>> drm_sched_stop(&ring->sched, &job->base);
>>>>>> + /*
>>>>>> + * Must check guilty signal here since after this point
>>>>>> all old
>>>>>> + * HW fences are force signaled.
>>>>>> + *
>>>>>> + * job->base holds a reference to parent fence
>>>>>> + */
>>>>>> + if (job_signaled && job && ring == job_ring &&
>>>>>> + job->base.s_fence->parent &&
>>>>>> + dma_fence_is_signaled(job->base.s_fence->parent))
>>>>>> + *job_signaled = true;
>>>>>> +
>>>>> That won't work correctly. See when the guilty job is not on the
>>>>> first
>>>>> scheduler, you would already have force completed some before getting
>>>>> here.
>>>>>
>>>>> Better to stop all schedulers first and then do the check.
>>>>>
>>>>> Christian.
>>>> What do you mean by first scheduler ? There is one scheduler object
>>>> per
>>>> ring so I am not clear what 'first' means here.
>>> Well for example if the guilty job is from a compute ring the we have
>>> already force signaled the gfx ring here.
>>>
>>> Same is true for other devices in the same hive, so it would probably
>>> be a good idea to move the force signaling and the IP reset somewhere
>>> else and this check up a layer.
>>>
>>> Christian.
>>
>> Let me see if I understand you correctly - you want to AVOID ANY force
>> signaling in case we are not going to HW reset and so you want to have
>> the check if guilty is signaled BEFORE any ring fences are force
>> signaled. Correct ?
>
> Correct.
>
> Basically we should do the following:
> 1. Stop all schedulers to make sure that nothing is going on.
> 2. Check the guilty job once more to make sure that it hasn't signaled
> in the meantime.
> 3. Start our reset procedure, with force complete, soft reset
> eventually hard reset etc etc..
> 4. Resubmit all not yet completed jobs.
> 5. Start the schedulers again.
>
> Christian.
Why not just always ensure the guilty job's ring is always checked
first and then do the rest of the rings - inside
amdgpu_device_pre_asic_reset. Seems to me like a much smaller change
with less impact to current code structure.
Andrey
>
>>
>> Andrey
>>
>>>> Andrey
>>>>
>>>>
>>>>>> /* after all hw jobs are reset, hw fence is
>>>>>> meaningless, so
>>>>>> force_completion */
>>>>>> amdgpu_fence_driver_force_completion(ring);
>>>>>> }
>>>>>> @@ -3358,7 +3371,8 @@ static int amdgpu_device_pre_asic_reset(struct
>>>>>> amdgpu_device *adev,
>>>>>> - if (!amdgpu_sriov_vf(adev)) {
>>>>>> + /* Don't suspend on bare metal if we are not going to HW reset
>>>>>> the ASIC */
>>>>>> + if (!amdgpu_sriov_vf(adev) && !(*job_signaled)) {
>>>>>> if (!need_full_reset)
>>>>>> need_full_reset =
>>>>>> amdgpu_device_ip_need_full_reset(adev);
>>>>>> @@ -3495,7 +3509,7 @@ static int amdgpu_do_asic_reset(struct
>>>>>> amdgpu_hive_info *hive,
>>>>>> return r;
>>>>>> }
>>>>>> -static void amdgpu_device_post_asic_reset(struct amdgpu_device
>>>>>> *adev)
>>>>>> +static void amdgpu_device_post_asic_reset(struct amdgpu_device
>>>>>> *adev, bool job_signaled)
>>>>>> {
>>>>>> int i;
>>>>>> @@ -3505,7 +3519,8 @@ static void
>>>>>> amdgpu_device_post_asic_reset(struct amdgpu_device *adev)
>>>>>> if (!ring || !ring->sched.thread)
>>>>>> continue;
>>>>>> - if (!adev->asic_reset_res)
>>>>>> + /* No point to resubmit jobs if we didn't HW reset*/
>>>>>> + if (!adev->asic_reset_res && !job_signaled)
>>>>>> drm_sched_resubmit_jobs(&ring->sched);
>>>>>> drm_sched_start(&ring->sched, !adev->asic_reset_res);
>>>>>> @@ -3518,14 +3533,21 @@ static void
>>>>>> amdgpu_device_post_asic_reset(struct amdgpu_device *adev)
>>>>>> adev->asic_reset_res = 0;
>>>>>> }
>>>>>> -static void amdgpu_device_lock_adev(struct amdgpu_device *adev)
>>>>>> +static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
>>>>>> bool
>>>>>> trylock)
>>>>>> {
>>>>>> - mutex_lock(&adev->lock_reset);
>>>>>> + if (trylock) {
>>>>>> + if (!mutex_trylock(&adev->lock_reset))
>>>>>> + return false;
>>>>>> + } else
>>>>>> + mutex_lock(&adev->lock_reset);
>>>>>> +
>>>>>> atomic_inc(&adev->gpu_reset_counter);
>>>>>> adev->in_gpu_reset = 1;
>>>>>> /* Block kfd: SRIOV would do it separately */
>>>>>> if (!amdgpu_sriov_vf(adev))
>>>>>> amdgpu_amdkfd_pre_reset(adev);
>>>>>> +
>>>>>> + return true;
>>>>>> }
>>>>>> static void amdgpu_device_unlock_adev(struct amdgpu_device
>>>>>> *adev)
>>>>>> @@ -3555,29 +3577,44 @@ int amdgpu_device_gpu_recover(struct
>>>>>> amdgpu_device *adev,
>>>>>> {
>>>>>> int r;
>>>>>> struct amdgpu_hive_info *hive = NULL;
>>>>>> - bool need_full_reset = false;
>>>>>> struct amdgpu_device *tmp_adev = NULL;
>>>>>> struct list_head device_list, *device_list_handle = NULL;
>>>>>> + bool xgmi_topology_present, need_full_reset, job_signaled;
>>>>>> + need_full_reset = job_signaled = false;
>>>>>> INIT_LIST_HEAD(&device_list);
>>>>>> dev_info(adev->dev, "GPU reset begin!\n");
>>>>>> + hive = amdgpu_get_xgmi_hive(adev, 0);
>>>>>> + xgmi_topology_present = hive &&
>>>>>> adev->gmc.xgmi.num_physical_nodes > 1;
>>>>>> +
>>>>>> /*
>>>>>> - * In case of XGMI hive disallow concurrent resets to be
>>>>>> triggered
>>>>>> - * by different nodes. No point also since the one node already
>>>>>> executing
>>>>>> - * reset will also reset all the other nodes in the hive.
>>>>>> + * Here we trylock to avoid chain of resets executing from
>>>>>> + * either trigger by jobs on different adevs in XGMI hive or
>>>>>> jobs on
>>>>>> + * different schedulers for same device while this tdr is
>>>>>> running.
>>>>>> + * We always reset all schedulers for device and all devices
>>>>>> for
>>>>>> XGMI
>>>>>> + * hive so that should take care of them too.
>>>>>> */
>>>>>> - hive = amdgpu_get_xgmi_hive(adev, 0);
>>>>>> - if (hive && adev->gmc.xgmi.num_physical_nodes > 1 &&
>>>>>> - !mutex_trylock(&hive->reset_lock))
>>>>>> +
>>>>>> + if (xgmi_topology_present &&
>>>>>> !mutex_trylock(&hive->reset_lock)) {
>>>>>> + DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as
>>>>>> another already in progress",
>>>>>> + job->base.id, hive->hive_id);
>>>>>> return 0;
>>>>>> + }
>>>>>> /* Start with adev pre asic reset first for soft reset
>>>>>> check.*/
>>>>>> - amdgpu_device_lock_adev(adev);
>>>>>> + if (!amdgpu_device_lock_adev(adev, !xgmi_topology_present)) {
>>>>>> + DRM_INFO("Bailing on TDR for s_job:%llx, as another already
>>>>>> in progress",
>>>>>> + job->base.id);
>>>>>> + return 0;
>>>>>> + }
>>>>>> +
>>>>>> + /* Guilty job will be freed after this*/
>>>>>> r = amdgpu_device_pre_asic_reset(adev,
>>>>>> job,
>>>>>> - &need_full_reset);
>>>>>> + &need_full_reset,
>>>>>> + &job_signaled);
>>>>>> if (r) {
>>>>>> /*TODO Should we stop ?*/
>>>>>> DRM_ERROR("GPU pre asic reset failed with err, %d for
>>>>>> drm
>>>>>> dev, %s ",
>>>>>> @@ -3609,10 +3646,11 @@ int amdgpu_device_gpu_recover(struct
>>>>>> amdgpu_device *adev,
>>>>>> if (tmp_adev == adev)
>>>>>> continue;
>>>>>> - amdgpu_device_lock_adev(tmp_adev);
>>>>>> + amdgpu_device_lock_adev(tmp_adev, false);
>>>>>> r = amdgpu_device_pre_asic_reset(tmp_adev,
>>>>>> NULL,
>>>>>> - &need_full_reset);
>>>>>> + &need_full_reset,
>>>>>> + &job_signaled);
>>>>>> /*TODO Should we stop ?*/
>>>>>> if (r) {
>>>>>> DRM_ERROR("GPU pre asic reset failed with err, %d
>>>>>> for
>>>>>> drm dev, %s ",
>>>>>> @@ -3623,19 +3661,21 @@ int amdgpu_device_gpu_recover(struct
>>>>>> amdgpu_device *adev,
>>>>>> /* Actual ASIC resets if needed.*/
>>>>>> /* TODO Implement XGMI hive reset logic for SRIOV */
>>>>>> - if (amdgpu_sriov_vf(adev)) {
>>>>>> - r = amdgpu_device_reset_sriov(adev, job ? false : true);
>>>>>> - if (r)
>>>>>> - adev->asic_reset_res = r;
>>>>>> - } else {
>>>>>> - r = amdgpu_do_asic_reset(hive, device_list_handle,
>>>>>> &need_full_reset);
>>>>>> - if (r && r == -EAGAIN)
>>>>>> - goto retry;
>>>>>> + if (!job || !job_signaled) {
>>>>>> + if (amdgpu_sriov_vf(adev)) {
>>>>>> + r = amdgpu_device_reset_sriov(adev, job ? false :
>>>>>> true);
>>>>>> + if (r)
>>>>>> + adev->asic_reset_res = r;
>>>>>> + } else {
>>>>>> + r = amdgpu_do_asic_reset(hive, device_list_handle,
>>>>>> &need_full_reset);
>>>>>> + if (r && r == -EAGAIN)
>>>>>> + goto retry;
>>>>>> + }
>>>>>> }
>>>>>> /* Post ASIC reset for all devs .*/
>>>>>> list_for_each_entry(tmp_adev, device_list_handle,
>>>>>> gmc.xgmi.head) {
>>>>>> - amdgpu_device_post_asic_reset(tmp_adev);
>>>>>> + amdgpu_device_post_asic_reset(tmp_adev, job_signaled);
>>>>>> if (r) {
>>>>>> /* bad news, how to tell it to userspace ? */
>>>>>> @@ -3648,7 +3688,7 @@ int amdgpu_device_gpu_recover(struct
>>>>>> amdgpu_device *adev,
>>>>>> amdgpu_device_unlock_adev(tmp_adev);
>>>>>> }
>>>>>> - if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
>>>>>> + if (xgmi_topology_present)
>>>>>> mutex_unlock(&hive->reset_lock);
>>>>>> if (r)
>>>> _______________________________________________
>>>> amd-gfx mailing list
>>>> amd-gfx at lists.freedesktop.org
>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx at lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>
More information about the amd-gfx
mailing list