[PATCH 6/7] drm/amdgpu: recovery hw ring when gpu reset
zhoucm1
david1.zhou at amd.com
Wed Jun 29 09:08:31 UTC 2016
On 2016-06-29 17:03, Christian König wrote:
> On 29.06.2016 at 10:09, Chunming Zhou wrote:
>> Change-Id: I8e554d34c9e477ea255e0ed2a936397aa5f665e7
>> Signed-off-by: Chunming Zhou <David1.Zhou at amd.com>
>> ---
>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 +
>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 5 +++--
>> drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 4 ++--
>> drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c | 23 ++++++++++++++---------
>> drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 22 ++++++++++++++++++++++
>> 5 files changed, 42 insertions(+), 13 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index 163429c8..03b0fe7 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -763,6 +763,7 @@ void amdgpu_job_free(struct amdgpu_job *job);
>> int amdgpu_job_submit(struct amdgpu_job *job, struct amdgpu_ring *ring,
>> struct amd_sched_entity *entity, void *owner,
>> struct fence **f);
>> +void amdgpu_job_recovery(struct amd_gpu_scheduler *sched);
>> struct amdgpu_ring {
>> struct amdgpu_device *adev;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 5c4691c..2c8e7f4 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -1994,13 +1994,14 @@ retry:
>> }
>> /* restore scratch */
>> amdgpu_atombios_scratch_regs_restore(adev);
>> - if (0) {
>> + if (!r) {
>> for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>> struct amdgpu_ring *ring = adev->rings[i];
>> if (!ring)
>> continue;
>> + amdgpu_job_recovery(&ring->sched);
>> kthread_unpark(ring->sched.thread);
>> - amdgpu_ring_restore(ring, ring_sizes[i], ring_data[i]);
>> + kfree(ring_data[i]);
>> ring_sizes[i] = 0;
>> ring_data[i] = NULL;
>> }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>> index 72bf9f8..8af9903 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>> @@ -366,9 +366,9 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
>> setup_timer(&ring->fence_drv.fallback_timer, amdgpu_fence_fallback,
>> (unsigned long)ring);
>> - ring->fence_drv.num_fences_mask = num_hw_submission * 2 - 1;
>> + ring->fence_drv.num_fences_mask = num_hw_submission * 4 - 1;
>> spin_lock_init(&ring->fence_drv.lock);
>> - ring->fence_drv.fences = kcalloc(num_hw_submission * 2, sizeof(void *),
>> + ring->fence_drv.fences = kcalloc(num_hw_submission * 4, sizeof(void *),
>> GFP_KERNEL);
>> if (!ring->fence_drv.fences)
>> return -ENOMEM;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>> index 34e3542..702bd9b 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>> @@ -193,14 +193,19 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>> if (ring->funcs->emit_hdp_invalidate)
>> amdgpu_ring_emit_hdp_invalidate(ring);
>> - r = amdgpu_fence_emit(ring, &hwf);
>> - if (r) {
>> - dev_err(adev->dev, "failed to emit fence (%d)\n", r);
>> - if (job && job->vm_id)
>> - amdgpu_vm_reset_id(adev, job->vm_id);
>> - amdgpu_ring_undo(ring);
>> - return r;
>> - }
>> + if (!job || !job->fence) {
>> + r = amdgpu_fence_emit(ring, &hwf);
>> + if (r) {
>> + dev_err(adev->dev, "failed to emit fence (%d)\n", r);
>> + if (job && job->vm_id)
>> + amdgpu_vm_reset_id(adev, job->vm_id);
>> + amdgpu_ring_undo(ring);
>> + return r;
>> + }
>> + } else
>> + /* re-submit fence when gpu reset */
>> + amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr,
>> + job->fence->seqno, AMDGPU_FENCE_FLAG_INT);
>
> You shouldn't resubmit the old hardware fence here, but rather
> allocate a new fence number.
That would introduce a new problem and complicate things. E.g. take old hw
fences f1 and f2 on the hung engine; sched_f1 and sched_f2 are the scheduler
fences signalled from the callbacks of f1 and f2. If we allocate new fence
numbers when re-submitting, the jobs get new hw fences f3 and f4. When f3 is
signalled, the fence driver considers f1, f2, sched_f1 and sched_f2 signalled
as well, so userspace thinks the CS behind sched_f2 has completed, but in
fact it has not: its re-submitted work is now f4/sched_f4.
I don't see why we shouldn't/couldn't reuse the old hw fence.
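To make the ordering argument concrete, here is a toy user-space model
(hypothetical names, not the real fence code): a seqno-ordered fence driver
has to treat every seqno below the last signalled one as complete, which is
exactly what goes wrong if the hung jobs get fresh numbers.

#include <stdint.h>
#include <stdio.h>

/* Toy stand-in for a ring's fence bookkeeping; slots are indexed by
 * seqno & mask, like ring->fence_drv.fences. Everything here is made
 * up for illustration. */
#define TOY_MASK 7

struct toy_ring {
	uint32_t last_processed;	/* last seqno already handled */
	const char *slot[TOY_MASK + 1];	/* which sched fence waits here */
};

/* A seqno-ordered fence driver must signal everything up to the seqno
 * the hardware reports; lower numbers are implied complete. */
static void toy_fence_process(struct toy_ring *ring, uint32_t hw_seqno)
{
	while (ring->last_processed != hw_seqno) {
		++ring->last_processed;
		printf("seqno %u -> %s signalled\n",
		       (unsigned)ring->last_processed,
		       ring->slot[ring->last_processed & TOY_MASK]);
	}
}

int main(void)
{
	struct toy_ring ring = { 0, { 0 } };

	ring.slot[1] = "sched_f1";	/* hung job 1, old fence f1 */
	ring.slot[2] = "sched_f2";	/* hung job 2, old fence f2 */
	ring.slot[3] = "job 1 re-run";	/* renumbered as f3 */
	ring.slot[4] = "job 2 re-run";	/* renumbered as f4 */

	/* Job 1 finishes its re-run: the hardware writes seqno 3, and
	 * processing it also walks over seqno 2, signalling sched_f2
	 * before job 2 (now f4) has actually completed. */
	toy_fence_process(&ring, 3);
	return 0;
}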
>
>> /* wrap the last IB with fence */
>> if (job && job->uf_bo) {
>> @@ -212,7 +217,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>> }
>> if (f)
>> - *f = fence_get(hwf);
>> + *f = (job && job->fence) ? job->fence : fence_get(hwf);
>> if (patch_offset != ~0 && ring->funcs->patch_cond_exec)
>> amdgpu_ring_patch_cond_exec(ring, patch_offset);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> index 83771c1..32fad1c 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> @@ -39,6 +39,28 @@ static void amdgpu_job_timedout(struct amd_sched_job *s_job)
>> amdgpu_gpu_reset(job->adev);
>> }
>> +void amdgpu_job_recovery(struct amd_gpu_scheduler *sched)
>> +{
>> + struct amd_sched_job *s_job, *tmp;
>> +
>> + spin_lock(&sched->job_list_lock);
>> + list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
>> + struct amdgpu_job *job = to_amdgpu_job(s_job);
>> + if (job->vm) {
>> + struct amdgpu_vm_id *id = &job->adev->vm_manager.ids[job->vm_id];
>> + job->vm_pd_addr = amdgpu_bo_gpu_offset(job->vm->page_directory);
>> + id->pd_gpu_addr = job->vm_pd_addr;
>
> That is clearly not a good idea; the page directory could already be
> moving somewhere else while this is happening.
Hmm, how about directly using id->pd_gpu_addr? Since the PD BO must still be
there, id->pd_gpu_addr already holds the saved address.
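Just a fragment to show the idea, against the structures touched by this
patch (not compile-tested):

	/* Reuse the PD address the VMID already carries instead of
	 * re-reading the page directory BO, which could be moving at
	 * this point. */
	struct amdgpu_vm_id *id = &job->adev->vm_manager.ids[job->vm_id];

	job->vm_pd_addr = id->pd_gpu_addr;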
>
>> + }
>> + sched->ops->run_job(s_job);
>> + }
>> + s_job = list_first_entry_or_null(&sched->ring_mirror_list,
>> + struct amd_sched_job, node);
>> + if (s_job)
>> + schedule_delayed_work(&s_job->work_tdr, sched->timeout);
>> +
>> + spin_unlock(&sched->job_list_lock);
>> +}
>
> The whole function belongs into the scheduler.
If the vmid handling is factored out, this can indeed be a scheduler function.
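Something like this on the scheduler side, once the vmid fix-up is hidden
behind the driver's run_job callback (a rough sketch based on the code
above; the exact split is hypothetical):

void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
{
	struct amd_sched_job *s_job, *tmp;

	spin_lock(&sched->job_list_lock);
	/* re-submit every job still on the mirror list; the driver
	 * callback takes care of the vmid/pd_addr fix-up */
	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node)
		sched->ops->run_job(s_job);

	/* re-arm the timeout for the oldest pending job */
	s_job = list_first_entry_or_null(&sched->ring_mirror_list,
					 struct amd_sched_job, node);
	if (s_job)
		schedule_delayed_work(&s_job->work_tdr, sched->timeout);
	spin_unlock(&sched->job_list_lock);
}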
Thanks,
David Zhou
>
> Christian.
>
>> +
>> int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
>> struct amdgpu_job **job, struct amdgpu_vm *vm)
>> {
>