[PATCH 3/6] drm/xe: Add extra busted protection for the no GuC reset
Rodrigo Vivi
rodrigo.vivi at intel.com
Fri Mar 15 14:01:05 UTC 2024
When GuC doesn't reset the GPU on our behalf we need to be
extra cautious on timeout and skip scheduling jobs or
manually forcing a gt_reset. Otherwise we get into an
infinite loop of timeouts and reschedules.
So, this is a preparation for introducing the busted mode
where it gets busted in any single timeout/hang without
allowing GuC to reset.
XXX: This is enough to get a clean stop for the software
validation teams to debug the memory. However, the device
unbind will splat some WARNs because memory is not entirely
freed, since the hw_fences were not released.
Cc: Matthew Brost <matthew.brost at intel.com>
Signed-off-by: Rodrigo Vivi <rodrigo.vivi at intel.com>
---
drivers/gpu/drm/xe/xe_guc_submit.c | 17 +++++++++++------
1 file changed, 11 insertions(+), 6 deletions(-)
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
index 82c955a2a15c..ee663683e9eb 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.c
+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
@@ -176,7 +176,8 @@ static void set_exec_queue_killed(struct xe_exec_queue *q)
static bool exec_queue_killed_or_banned(struct xe_exec_queue *q)
{
- return exec_queue_killed(q) || exec_queue_banned(q);
+ return xe_device_busted(gt_to_xe(q->gt)) ||
+ exec_queue_killed(q) || exec_queue_banned(q);
}
#ifdef CONFIG_PROVE_LOCKING
@@ -960,7 +961,7 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
*/
if (q->flags & EXEC_QUEUE_FLAG_KERNEL ||
(q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q))) {
- if (!xe_sched_invalidate_job(job, 2)) {
+ if (!xe_sched_invalidate_job(job, 2) && !xe_device_busted(xe)) {
xe_sched_add_pending_job(sched, job);
xe_sched_submission_start(sched);
xe_gt_reset_async(q->gt);
@@ -969,7 +970,7 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
}
/* Engine state now stable, disable scheduling if needed */
- if (exec_queue_registered(q)) {
+ if (exec_queue_registered(q) && !xe_device_busted(xe)) {
struct xe_guc *guc = exec_queue_to_guc(q);
int ret;
@@ -1010,8 +1011,11 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
* Fence state now stable, stop / start scheduler which cleans up any
* fences that are complete
*/
- xe_sched_add_pending_job(sched, job);
- xe_sched_submission_start(sched);
+ if (!xe_device_busted(xe)) {
+ xe_sched_add_pending_job(sched, job);
+ xe_sched_submission_start(sched);
+ }
+
xe_guc_exec_queue_trigger_cleanup(q);
/* Mark all outstanding jobs as bad, thus completing them */
@@ -1024,7 +1028,8 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
xe_hw_fence_irq_start(q->fence_irq);
out:
- return DRM_GPU_SCHED_STAT_NOMINAL;
+ return xe_device_busted(xe) ? DRM_GPU_SCHED_STAT_ENODEV :
+ DRM_GPU_SCHED_STAT_NOMINAL;
}
static void __guc_exec_queue_fini_async(struct work_struct *w)
--
2.44.0
More information about the Intel-xe
mailing list