[PATCH 5/5] drm/amdgpu: sriov TDR only recover the hung ring
Monk Liu
Monk.Liu at amd.com
Mon May 1 07:22:51 UTC 2017
Instead of resetting and recovering all rings, we can work
only on the particular ring once it is detected as hung.
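
For reference, a minimal stand-alone sketch of the iteration order this
patch introduces (MAX_RINGS and the stripped-down struct ring/struct job
below are hypothetical stand-ins, not the real driver types): the walk
starts at the hung ring's index and wraps around with a modulo, so every
ring is still parked and unparked exactly once, but only the hung ring
gets the job reset and fence force-completion.

#include <stdio.h>

#define MAX_RINGS 8			/* stands in for AMDGPU_MAX_RINGS */

/* hypothetical, simplified stand-ins for the driver structures */
struct ring { int idx; };
struct job  { struct ring *ring; };

int main(void)
{
	struct ring rings[MAX_RINGS];
	struct job hung_job;
	struct job *job = &hung_job;	/* NULL would mean "no hung ring known" */
	int i, j;

	for (i = 0; i < MAX_RINGS; ++i)
		rings[i].idx = i;

	hung_job.ring = &rings[5];	/* pretend ring 5 raised the TDR */

	/* start from the hung ring, then wrap so every ring is visited once */
	j = job ? job->ring->idx : 0;
	for (i = j; i < j + MAX_RINGS; ++i) {
		struct ring *ring = &rings[i % MAX_RINGS];

		if (job && job->ring->idx != i) {
			/* healthy ring: it is only parked and later unparked */
			printf("ring %d: park/unpark only\n", ring->idx);
			continue;
		}
		/* hung ring (first iteration): reset jobs, force the fence */
		printf("ring %d: job_reset + force_completion\n", ring->idx);
	}
	return 0;
}

Note that only the first iteration can match job->ring->idx, since i is
never reduced modulo in the comparison; the loops in
amdgpu_sriov_gpu_reset() below rely on the same property.
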
Change-Id: Ie9de78819e1567e9f001d3593c9c52f749137c32
Signed-off-by: Monk Liu <Monk.Liu at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 35 ++++++++++++++++++++++++------
drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 6 +++++
drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 1 +
3 files changed, 35 insertions(+), 7 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 157d023..4dbd121 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2551,19 +2551,26 @@ int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, struct amdgpu_job *job, b
/* block TTM */
resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
- /* block scheduler */
- for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
- ring = adev->rings[i];
+ /* we start from the ring that triggered the GPU hang */
+ j = job ? job->ring->idx : 0;
+ /* block scheduler */
+ for (i = j; i < j + AMDGPU_MAX_RINGS; ++i) {
+ ring = adev->rings[i % AMDGPU_MAX_RINGS];
if (!ring || !ring->sched.thread)
continue;
kthread_park(ring->sched.thread);
+
+ if (job && job->ring->idx != i)
+ continue;
+
+ /* only do job_reset on the hung ring if @job is not NULL */
amd_sched_hw_job_reset(&ring->sched);
- }
- /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
- amdgpu_fence_driver_force_completion(adev);
+ /* after the hung ring's hw jobs are reset, its hw fence is meaningless, so force completion */
+ amdgpu_fence_driver_force_completion_ring(ring);
+ }
/* request to take full control of GPU before re-initialization */
if (voluntary)
@@ -2615,12 +2622,26 @@ int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, struct amdgpu_job *job, b
}
fence_put(fence);
+ /* before recovery and unpark, kick out the guilty job on every ring */
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
- struct amdgpu_ring *ring = adev->rings[i];
+ ring = adev->rings[i];
+
if (!ring || !ring->sched.thread)
continue;
amd_sched_job_kickout_guilty(&ring->sched);
+ }
+
+ for (i = j; i < j + AMDGPU_MAX_RINGS; ++i) {
+ ring = adev->rings[i % AMDGPU_MAX_RINGS];
+ if (!ring || !ring->sched.thread)
+ continue;
+
+ if (job && job->ring->idx != i) {
+ kthread_unpark(ring->sched.thread);
+ continue;
+ }
+
amd_sched_job_recovery(&ring->sched);
kthread_unpark(ring->sched.thread);
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 5772ef2..de4c851 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -541,6 +541,12 @@ void amdgpu_fence_driver_force_completion(struct amdgpu_device *adev)
}
}
+void amdgpu_fence_driver_force_completion_ring(struct amdgpu_ring *ring)
+{
+ if (ring)
+ amdgpu_fence_write(ring, ring->fence_drv.sync_seq);
+}
+
/*
* Common fence implementation
*/
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 5786cc3..2acaac6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -76,6 +76,7 @@ struct amdgpu_fence_driver {
int amdgpu_fence_driver_init(struct amdgpu_device *adev);
void amdgpu_fence_driver_fini(struct amdgpu_device *adev);
void amdgpu_fence_driver_force_completion(struct amdgpu_device *adev);
+void amdgpu_fence_driver_force_completion_ring(struct amdgpu_ring *ring);
int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
unsigned num_hw_submission);
--
2.7.4