[PATCH 6/7] drm/amdgpu: recovery hw ring when gpu reset
Christian König
deathsimple at vodafone.de
Wed Jun 29 09:03:14 UTC 2016
Am 29.06.2016 um 10:09 schrieb Chunming Zhou:
> Change-Id: I8e554d34c9e477ea255e0ed2a936397aa5f665e7
> Signed-off-by: Chunming Zhou <David1.Zhou at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 +
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 5 +++--
> drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 4 ++--
> drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c | 23 ++++++++++++++---------
> drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 22 ++++++++++++++++++++++
> 5 files changed, 42 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 163429c8..03b0fe7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -763,6 +763,7 @@ void amdgpu_job_free(struct amdgpu_job *job);
> int amdgpu_job_submit(struct amdgpu_job *job, struct amdgpu_ring *ring,
> struct amd_sched_entity *entity, void *owner,
> struct fence **f);
> +void amdgpu_job_recovery(struct amd_gpu_scheduler *sched);
>
> struct amdgpu_ring {
> struct amdgpu_device *adev;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 5c4691c..2c8e7f4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -1994,13 +1994,14 @@ retry:
> }
> /* restore scratch */
> amdgpu_atombios_scratch_regs_restore(adev);
> - if (0) {
> + if (!r) {
> for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> struct amdgpu_ring *ring = adev->rings[i];
> if (!ring)
> continue;
> + amdgpu_job_recovery(&ring->sched);
> kthread_unpark(ring->sched.thread);
> - amdgpu_ring_restore(ring, ring_sizes[i], ring_data[i]);
> + kfree(ring_data[i]);
> ring_sizes[i] = 0;
> ring_data[i] = NULL;
> }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> index 72bf9f8..8af9903 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> @@ -366,9 +366,9 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
> setup_timer(&ring->fence_drv.fallback_timer, amdgpu_fence_fallback,
> (unsigned long)ring);
>
> - ring->fence_drv.num_fences_mask = num_hw_submission * 2 - 1;
> + ring->fence_drv.num_fences_mask = num_hw_submission * 4 - 1;
> spin_lock_init(&ring->fence_drv.lock);
> - ring->fence_drv.fences = kcalloc(num_hw_submission * 2, sizeof(void *),
> + ring->fence_drv.fences = kcalloc(num_hw_submission * 4, sizeof(void *),
> GFP_KERNEL);
> if (!ring->fence_drv.fences)
> return -ENOMEM;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> index 34e3542..702bd9b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> @@ -193,14 +193,19 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
> if (ring->funcs->emit_hdp_invalidate)
> amdgpu_ring_emit_hdp_invalidate(ring);
>
> - r = amdgpu_fence_emit(ring, &hwf);
> - if (r) {
> - dev_err(adev->dev, "failed to emit fence (%d)\n", r);
> - if (job && job->vm_id)
> - amdgpu_vm_reset_id(adev, job->vm_id);
> - amdgpu_ring_undo(ring);
> - return r;
> - }
> + if (!job || !job->fence) {
> + r = amdgpu_fence_emit(ring, &hwf);
> + if (r) {
> + dev_err(adev->dev, "failed to emit fence (%d)\n", r);
> + if (job && job->vm_id)
> + amdgpu_vm_reset_id(adev, job->vm_id);
> + amdgpu_ring_undo(ring);
> + return r;
> + }
> + } else
> + /* re-submit fence when gpu reset */
> + amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr,
> + job->fence->seqno, AMDGPU_FENCE_FLAG_INT);
You shouldn't resubmit the old hardware fence here, but rather allocate
a new fence number.
>
> /* wrap the last IB with fence */
> if (job && job->uf_bo) {
> @@ -212,7 +217,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
> }
>
> if (f)
> - *f = fence_get(hwf);
> + *f = (job && job->fence) ? job->fence : fence_get(hwf);
>
> if (patch_offset != ~0 && ring->funcs->patch_cond_exec)
> amdgpu_ring_patch_cond_exec(ring, patch_offset);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index 83771c1..32fad1c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -39,6 +39,28 @@ static void amdgpu_job_timedout(struct amd_sched_job *s_job)
> amdgpu_gpu_reset(job->adev);
> }
>
> +void amdgpu_job_recovery(struct amd_gpu_scheduler *sched)
> +{
> + struct amd_sched_job *s_job, *tmp;
> +
> + spin_lock(&sched->job_list_lock);
> + list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
> + struct amdgpu_job *job = to_amdgpu_job(s_job);
> + if (job->vm) {
> + struct amdgpu_vm_id *id = &job->adev->vm_manager.ids[job->vm_id];
> + job->vm_pd_addr = amdgpu_bo_gpu_offset(job->vm->page_directory);
> + id->pd_gpu_addr = job->vm_pd_addr;
That is clearly not a good idea; the page directory could already be moving
somewhere else while this is happening.
> + }
> + sched->ops->run_job(s_job);
> + }
> + s_job = list_first_entry_or_null(&sched->ring_mirror_list,
> + struct amd_sched_job, node);
> + if (s_job)
> + schedule_delayed_work(&s_job->work_tdr, sched->timeout);
> +
> + spin_unlock(&sched->job_list_lock);
> +}
The whole function belongs in the scheduler.
Christian.
> +
> int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
> struct amdgpu_job **job, struct amdgpu_vm *vm)
> {
More information about the amd-gfx
mailing list