[PATCH 1/4] drm/amdgpu: stop all rings before doing gpu recover
Andrey Grodzovsky
Andrey.Grodzovsky at amd.com
Wed Feb 28 13:30:50 UTC 2018
Will do once Monk sends V2 for [PATCH 4/4] drm/amdgpu: try again kiq
access if not in IRQ
Andrey
On 02/28/2018 07:20 AM, Christian König wrote:
> Andrey please give this set a good testing as well.
>
> Am 28.02.2018 um 08:21 schrieb Monk Liu:
>> found recover_vram_from_shadow sometimes gets executed
>> in parallel with the SDMA scheduler; all schedulers
>> should be stopped before doing a gpu reset/recover
>>
>> Change-Id: Ibaef3e3c015f3cf88f84b2eaf95cda95ae1a64e3
>> Signed-off-by: Monk Liu <Monk.Liu at amd.com>
>
> For now this patch is Reviewed-by: Christian König
> <christian.koenig at amd.com>.
>
> Regards,
> Christian.
>
>> ---
>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 40 +++++++++++-------------------
>> 1 file changed, 15 insertions(+), 25 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 75d1733..e9d81a8 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -2649,22 +2649,23 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>> /* block TTM */
>> resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
>> +
>> /* store modesetting */
>> if (amdgpu_device_has_dc_support(adev))
>> state = drm_atomic_helper_suspend(adev->ddev);
>> - /* block scheduler */
>> + /* block all schedulers and reset given job's ring */
>> for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>> struct amdgpu_ring *ring = adev->rings[i];
>> if (!ring || !ring->sched.thread)
>> continue;
>> - /* only focus on the ring hit timeout if &job not NULL */
>> + kthread_park(ring->sched.thread);
>> +
>> if (job && job->ring->idx != i)
>> continue;
>> - kthread_park(ring->sched.thread);
>> drm_sched_hw_job_reset(&ring->sched, &job->base);
>> /* after all hw jobs are reset, hw fence is meaningless,
>> so force_completion */
>> @@ -2707,33 +2708,22 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>> }
>> dma_fence_put(fence);
>> }
>> + }
>> - for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>> - struct amdgpu_ring *ring = adev->rings[i];
>> -
>> - if (!ring || !ring->sched.thread)
>> - continue;
>> + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>> + struct amdgpu_ring *ring = adev->rings[i];
>> - /* only focus on the ring hit timeout if &job not NULL */
>> - if (job && job->ring->idx != i)
>> - continue;
>> + if (!ring || !ring->sched.thread)
>> + continue;
>> + /* only need recovery sched of the given job's ring
>> + * or all rings (in the case @job is NULL)
>> + * after above amdgpu_reset accomplished
>> + */
>> + if ((!job || job->ring->idx == i) && !r)
>> drm_sched_job_recovery(&ring->sched);
>> - kthread_unpark(ring->sched.thread);
>> - }
>> - } else {
>> - for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>> - struct amdgpu_ring *ring = adev->rings[i];
>> - if (!ring || !ring->sched.thread)
>> - continue;
>> -
>> - /* only focus on the ring hit timeout if &job not NULL */
>> - if (job && job->ring->idx != i)
>> - continue;
>> -
>> - kthread_unpark(adev->rings[i]->sched.thread);
>> - }
>> + kthread_unpark(ring->sched.thread);
>> }
>> if (amdgpu_device_has_dc_support(adev)) {
>
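For anyone following along, the control-flow change in this patch can be modeled in a small user-space sketch. This is NOT the amdgpu code: the ring table, `gpu_recover()` helper, and flags below are hypothetical stand-ins used only to illustrate the reordering, i.e. park every scheduler up front (so nothing like the SDMA scheduler runs concurrently with VRAM recovery), reset hw jobs only on the offending ring, then recover and unpark in one final loop.

```c
#include <stdbool.h>
#include <stdio.h>

/* Hypothetical model of the reordered recovery flow; names and helpers
 * are illustrative, not the real amdgpu API. */

#define MAX_RINGS 4

struct ring {
    const char *name;
    bool present;   /* models (ring && ring->sched.thread) */
    bool parked;    /* models kthread_park()/kthread_unpark() */
    bool recovered; /* models drm_sched_job_recovery() */
};

static struct ring rings[MAX_RINGS] = {
    { "gfx",   true,  false, false },
    { "sdma0", true,  false, false },
    { NULL,    false, false, false },  /* empty slot, skipped */
    { "kiq",   true,  false, false },
};

/* job_ring < 0 models "job == NULL": recover every ring.
 * reset_err models the @r status of the preceding asic reset. */
static void gpu_recover(int job_ring, int reset_err)
{
    int i;

    /* 1) After the patch: park ALL schedulers first, so no scheduler
     *    runs in parallel with recover_vram_from_shadow. Only the
     *    ring that hit the timeout gets its hw jobs reset. */
    for (i = 0; i < MAX_RINGS; ++i) {
        if (!rings[i].present)
            continue;
        rings[i].parked = true;
        if (job_ring >= 0 && job_ring != i)
            continue;
        printf("hw job reset on %s\n", rings[i].name);
    }

    /* ... asic reset and recover_vram_from_shadow would run here,
     * with every scheduler safely parked ... */

    /* 2) One merged loop replaces the two old unpark paths: recover
     *    the given job's ring (or all rings when no job), then
     *    unpark every scheduler. */
    for (i = 0; i < MAX_RINGS; ++i) {
        if (!rings[i].present)
            continue;
        if ((job_ring < 0 || job_ring == i) && !reset_err)
            rings[i].recovered = true;
        rings[i].parked = false;
    }
}
```

Before the patch, only the timed-out ring was parked before reset, leaving the other schedulers free to submit while VRAM was being restored from shadow; the sketch's first loop shows why the reordering closes that race.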