[PATCH v2 09/12] drm/panfrost: Don't reset the GPU on job faults unless we really have to

Mon Jun 21 13:39:04 UTC 2021

If we can recover from a fault without a reset there's no reason to
issue one.

Signed-off-by: Boris Brezillon <boris.brezillon at collabora.com>
---
 drivers/gpu/drm/panfrost/panfrost_device.c |  9 ++++++
 drivers/gpu/drm/panfrost/panfrost_device.h |  2 ++
 drivers/gpu/drm/panfrost/panfrost_job.c    | 35 ++++++++++++++--------
 3 files changed, 34 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/panfrost/panfrost_device.c b/drivers/gpu/drm/panfrost/panfrost_device.c
index 2de011cee258..ac76e8646e97 100644
--- a/drivers/gpu/drm/panfrost/panfrost_device.c
+++ b/drivers/gpu/drm/panfrost/panfrost_device.c
@@ -383,6 +383,15 @@ int panfrost_exception_to_error(u32 exception_code)
 	return panfrost_exception_infos[exception_code].error;
 }
 
+bool panfrost_exception_needs_reset(const struct panfrost_device *pfdev,
+				    u32 exception_code)
+{
+	/* Right now, none of the GPU we support need a reset, but this
+	 * might change (e.g. Valhall GPUs require a when a BUS_FAULT occurs).
+	 */
+	return false;
+}
+
 void panfrost_device_reset(struct panfrost_device *pfdev)
 {
 	panfrost_gpu_soft_reset(pfdev);
diff --git a/drivers/gpu/drm/panfrost/panfrost_device.h b/drivers/gpu/drm/panfrost/panfrost_device.h
index 498c7b5dccd0..95e6044008d2 100644
--- a/drivers/gpu/drm/panfrost/panfrost_device.h
+++ b/drivers/gpu/drm/panfrost/panfrost_device.h
@@ -175,6 +175,8 @@ int panfrost_device_suspend(struct device *dev);
 
 const char *panfrost_exception_name(u32 exception_code);
 int panfrost_exception_to_error(u32 exception_code);
+bool panfrost_exception_needs_reset(const struct panfrost_device *pfdev,
+				    u32 exception_code);
 
 static inline void
 panfrost_device_schedule_reset(struct panfrost_device *pfdev)
diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c
index be5d3e4a1d0a..aedc604d331c 100644
--- a/drivers/gpu/drm/panfrost/panfrost_job.c
+++ b/drivers/gpu/drm/panfrost/panfrost_job.c
@@ -493,27 +493,38 @@ static irqreturn_t panfrost_job_irq_handler(int irq, void *data)
 
 		if (status & JOB_INT_MASK_ERR(j)) {
 			enum panfrost_queue_status old_status;
+			u32 js_status = job_read(pfdev, JS_STATUS(j));
 
 			job_write(pfdev, JS_COMMAND_NEXT(j), JS_COMMAND_NOP);
 
 			dev_err(pfdev->dev, "js fault, js=%d, status=%s, head=0x%x, tail=0x%x",
 				j,
-				panfrost_exception_name(job_read(pfdev, JS_STATUS(j))),
+				panfrost_exception_name(js_status),
 				job_read(pfdev, JS_HEAD_LO(j)),
 				job_read(pfdev, JS_TAIL_LO(j)));
 
-			/*
-			 * When the queue is being restarted we don't report
-			 * faults directly to avoid races between the timeout
-			 * and reset handlers. panfrost_scheduler_start() will
-			 * call drm_sched_fault() after the queue has been
-			 * started if status == FAULT_PENDING.
+			/* If we need a reset, signal it to the reset handler,
+			 * otherwise, update the fence error field and signal
+			 * the job fence.
 			 */
-			old_status = atomic_cmpxchg(&pfdev->js->queue[j].status,
-						    PANFROST_QUEUE_STATUS_STARTING,
-						    PANFROST_QUEUE_STATUS_FAULT_PENDING);
-			if (old_status == PANFROST_QUEUE_STATUS_ACTIVE)
-				drm_sched_fault(&pfdev->js->queue[j].sched);
+			if (panfrost_exception_needs_reset(pfdev, js_status)) {
+				/*
+				 * When the queue is being restarted we don't report
+				 * faults directly to avoid races between the timeout
+				 * and reset handlers. panfrost_scheduler_start() will
+				 * call drm_sched_fault() after the queue has been
+				 * started if status == FAULT_PENDING.
+				 */
+				old_status = atomic_cmpxchg(&pfdev->js->queue[j].status,
+							    PANFROST_QUEUE_STATUS_STARTING,
+							    PANFROST_QUEUE_STATUS_FAULT_PENDING);
+				if (old_status == PANFROST_QUEUE_STATUS_ACTIVE)
+					drm_sched_fault(&pfdev->js->queue[j].sched);
+			} else {
+				dma_fence_set_error(pfdev->jobs[j]->done_fence,
+						    panfrost_exception_to_error(js_status));
+				status |= JOB_INT_MASK_DONE(j);
+			}
 		}
 
 		if (status & JOB_INT_MASK_DONE(j)) {
-- 
2.31.1