[PATCH 1/7] amd/scheduler: implement job skip feature (v2)

Monk Liu Monk.Liu at amd.com
Thu Oct 26 10:30:21 UTC 2017


Jobs are skipped in two cases (a condensed sketch follows the list):

1) When the entity behind a job has been marked guilty, the job
popped from that entity's queue is dropped in the sched_main loop.

2) In job_recovery(), a job is skipped if its karma is above the hang
limit, and so are all other jobs sharing the same fence context. We key
on the fence context here because job_recovery() cannot access
job->entity: the entity may already be dead.
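
A condensed sketch of the two skip paths, simplified from the diff
below (locking, error paths and surrounding code omitted):

	/* case 1: amd_sched_entity_pop_job() fake-signals jobs of a
	 * guilty entity with -ECANCELED instead of handing them to
	 * the hardware.
	 */
	if (entity->guilty && atomic_read(entity->guilty)) {
		amd_sched_job_fake_signal(sched_job);
		return NULL;
	}

	/* case 2: amd_sched_job_recovery() remembers the fence context
	 * of the first job whose karma exceeds the hang limit, then asks
	 * run_job() to cancel every mirrored job sharing that context.
	 */
	if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) {
		found_guilty = true;
		guilty_context = s_job->s_fence->scheduled.context;
	}
	skip = found_guilty && s_job->s_fence->scheduled.context == guilty_context;
	fence = sched->ops->run_job(s_job, skip);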

v2:
some logic fixes

With this feature in place we can introduce the new GPU recovery feature.

Change-Id: I268b1c752c94e6ecd4ea78c87eb226ea3f52908a
Signed-off-by: Monk Liu <Monk.Liu at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c       |  9 +++--
 drivers/gpu/drm/amd/scheduler/gpu_scheduler.c | 54 ++++++++++++++++++++-------
 drivers/gpu/drm/amd/scheduler/gpu_scheduler.h |  2 +-
 3 files changed, 47 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index a58e3c5..f08fde9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -177,7 +177,7 @@ static struct dma_fence *amdgpu_job_dependency(struct amd_sched_job *sched_job)
 	return fence;
 }
 
-static struct dma_fence *amdgpu_job_run(struct amd_sched_job *sched_job)
+static struct dma_fence *amdgpu_job_run(struct amd_sched_job *sched_job, bool skip)
 {
 	struct dma_fence *fence = NULL;
 	struct amdgpu_device *adev;
@@ -194,10 +194,11 @@ static struct dma_fence *amdgpu_job_run(struct amd_sched_job *sched_job)
 	BUG_ON(amdgpu_sync_peek_fence(&job->sync, NULL));
 
 	trace_amdgpu_sched_run_job(job);
-	/* skip ib schedule when vram is lost */
-	if (job->vram_lost_counter != atomic_read(&adev->vram_lost_counter)) {
+
+	if (skip || job->vram_lost_counter != atomic_read(&adev->vram_lost_counter)) {
+		/* skip IB scheduling if requested or VRAM is lost, and set the error */
 		dma_fence_set_error(&job->base.s_fence->finished, -ECANCELED);
-		DRM_ERROR("Skip scheduling IBs!\n");
+		DRM_INFO("Skip scheduling IBs!\n");
 	} else {
 		r = amdgpu_ib_schedule(job->ring, job->num_ibs, job->ibs, job,
 				       &fence);
diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
index 9cbeade..995661e 100644
--- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
+++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
@@ -330,6 +330,28 @@ static bool amd_sched_entity_add_dependency_cb(struct amd_sched_entity *entity)
 	return false;
 }
 
+static void amd_sched_job_fake_signal(struct amd_sched_job *job)
+{
+	struct amd_sched_fence *s_fence = job->s_fence;
+	struct amd_gpu_scheduler *sched = job->sched;
+	struct amd_sched_entity *entity = job->s_entity;
+
+	dma_fence_set_error(&s_fence->finished, -ECANCELED);
+
+	/* fake signaling the scheduled fence */
+	atomic_inc(&sched->hw_rq_count);
+	amd_sched_fence_scheduled(s_fence);
+	wake_up(&sched->job_scheduled);
+
+	/* fake signaling the finished fence */
+	job->s_entity = NULL;
+	spsc_queue_pop(&entity->job_queue);
+
+	amd_sched_process_job(NULL, &s_fence->cb);
+	dma_fence_put(&s_fence->finished);
+	sched->ops->free_job(job);
+}
+
 static struct amd_sched_job *
 amd_sched_entity_pop_job(struct amd_sched_entity *entity)
 {
@@ -344,6 +366,11 @@ amd_sched_entity_pop_job(struct amd_sched_entity *entity)
 		if (amd_sched_entity_add_dependency_cb(entity))
 			return NULL;
 
+	if (entity->guilty && atomic_read(entity->guilty)) {
+		amd_sched_job_fake_signal(sched_job);
+		return NULL;
+	}
+
 	sched_job->s_entity = NULL;
 	spsc_queue_pop(&entity->job_queue);
 	return sched_job;
@@ -441,13 +468,6 @@ static void amd_sched_job_timedout(struct work_struct *work)
 	job->sched->ops->timedout_job(job);
 }
 
-static void amd_sched_set_guilty(struct amd_sched_job *s_job)
-{
-	if (atomic_inc_return(&s_job->karma) > s_job->sched->hang_limit)
-		if (s_job->s_entity->guilty)
-			atomic_set(s_job->s_entity->guilty, 1);
-}
-
 void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_job *bad)
 {
 	struct amd_sched_job *s_job;
@@ -466,7 +486,7 @@ void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_jo
 	}
 	spin_unlock(&sched->job_list_lock);
 
-	if (bad) {
+	if (bad && atomic_inc_return(&bad->karma) > bad->sched->hang_limit) {
 		bool found = false;
 
 		for (i = AMD_SCHED_PRIORITY_MIN; i < AMD_SCHED_PRIORITY_MAX; i++ ) {
@@ -476,7 +496,8 @@ void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_jo
 			list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
 				if (bad->s_fence->scheduled.context == entity->fence_context) {
 					found = true;
-					amd_sched_set_guilty(bad);
+					if (entity->guilty)
+						atomic_set(entity->guilty, 1);
 					break;
 				}
 			}
@@ -499,6 +520,7 @@ void amd_sched_job_kickout(struct amd_sched_job *s_job)
 void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
 {
 	struct amd_sched_job *s_job, *tmp;
+	bool found_guilty = false;
 	int r;
 
 	spin_lock(&sched->job_list_lock);
@@ -510,9 +532,17 @@ void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
 	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
 		struct amd_sched_fence *s_fence = s_job->s_fence;
 		struct dma_fence *fence;
+		bool skip;
+		uint64_t guilty_context;
+
+		if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) {
+			found_guilty = true;
+			guilty_context = s_job->s_fence->scheduled.context;
+		}
 
+		skip = (found_guilty && s_job->s_fence->scheduled.context == guilty_context);
 		spin_unlock(&sched->job_list_lock);
-		fence = sched->ops->run_job(s_job);
+		fence = sched->ops->run_job(s_job, skip);
 		atomic_inc(&sched->hw_rq_count);
 		if (fence) {
 			s_fence->parent = dma_fence_get(fence);
@@ -525,7 +555,6 @@ void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
 					  r);
 			dma_fence_put(fence);
 		} else {
-			DRM_ERROR("Failed to run job!\n");
 			amd_sched_process_job(NULL, &s_fence->cb);
 		}
 		spin_lock(&sched->job_list_lock);
@@ -650,7 +679,7 @@ static int amd_sched_main(void *param)
 		atomic_inc(&sched->hw_rq_count);
 		amd_sched_job_begin(sched_job);
 
-		fence = sched->ops->run_job(sched_job);
+		fence = sched->ops->run_job(sched_job, false);
 		amd_sched_fence_scheduled(s_fence);
 
 		if (fence) {
@@ -664,7 +693,6 @@ static int amd_sched_main(void *param)
 					  r);
 			dma_fence_put(fence);
 		} else {
-			DRM_ERROR("Failed to run job!\n");
 			amd_sched_process_job(NULL, &s_fence->cb);
 		}
 
diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
index f9e3a83..a8e5a7d 100644
--- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
+++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
@@ -126,7 +126,7 @@ static inline bool amd_sched_invalidate_job(struct amd_sched_job *s_job, int thr
 */
 struct amd_sched_backend_ops {
 	struct dma_fence *(*dependency)(struct amd_sched_job *sched_job);
-	struct dma_fence *(*run_job)(struct amd_sched_job *sched_job);
+	struct dma_fence *(*run_job)(struct amd_sched_job *sched_job, bool skip);
 	void (*timedout_job)(struct amd_sched_job *sched_job);
 	void (*free_job)(struct amd_sched_job *sched_job);
 };
-- 
2.7.4


