[PATCH 07/17] drm/msm: Add A5XX hardware fault detection
Jordan Crouse
jcrouse at codeaurora.org
Thu Jul 27 16:42:36 UTC 2017
The A5XX GPU has really good hardware fault detection that can
detect an abnormal hardware condition and fire an interrupt in
a matter of milliseconds, which is a lot better than waiting for
the hangcheck timer.
Enable the interrupt and log information before kicking off
recovery.
Signed-off-by: Jordan Crouse <jcrouse at codeaurora.org>
---
drivers/gpu/drm/msm/adreno/a5xx_gpu.c | 26 ++++++++++++++++++++++++++
1 file changed, 26 insertions(+)
diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
index 3af29cae..6361193 100644
--- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
@@ -438,6 +438,7 @@ static int a5xx_zap_shader_init(struct msm_gpu *gpu)
A5XX_RBBM_INT_0_MASK_RBBM_ETS_MS_TIMEOUT | \
A5XX_RBBM_INT_0_MASK_RBBM_ATB_ASYNC_OVERFLOW | \
A5XX_RBBM_INT_0_MASK_CP_HW_ERROR | \
+ A5XX_RBBM_INT_0_MASK_MISC_HANG_DETECT | \
A5XX_RBBM_INT_0_MASK_CP_CACHE_FLUSH_TS | \
A5XX_RBBM_INT_0_MASK_UCHE_OOB_ACCESS | \
A5XX_RBBM_INT_0_MASK_GPMU_VOLTAGE_DROOP)
@@ -843,6 +844,28 @@ static void a5xx_gpmu_err_irq(struct msm_gpu *gpu)
dev_err_ratelimited(gpu->dev->dev, "GPMU | voltage droop\n");
}
+static void a5xx_fault_detect_irq(struct msm_gpu *gpu)
+{
+ struct drm_device *dev = gpu->dev;
+ struct msm_drm_private *priv = dev->dev_private;
+ struct msm_ringbuffer *ring = gpu->funcs->active_ring(gpu);
+
+ dev_err(dev->dev, "gpu fault ring %d fence %x status %8.8X rb %4.4x/%4.4x ib1 %16.16llX/%4.4x ib2 %16.16llX/%4.4x\n",
+ ring ? ring->id : -1, ring ? ring->seqno : 0,
+ gpu_read(gpu, REG_A5XX_RBBM_STATUS),
+ gpu_read(gpu, REG_A5XX_CP_RB_RPTR),
+ gpu_read(gpu, REG_A5XX_CP_RB_WPTR),
+ gpu_read64(gpu, REG_A5XX_CP_IB1_BASE, REG_A5XX_CP_IB1_BASE_HI),
+ gpu_read(gpu, REG_A5XX_CP_IB1_BUFSZ),
+ gpu_read64(gpu, REG_A5XX_CP_IB2_BASE, REG_A5XX_CP_IB2_BASE_HI),
+ gpu_read(gpu, REG_A5XX_CP_IB2_BUFSZ));
+
+ /* Stop the hangcheck timer; the recovery work is queued right below */
+ del_timer(&gpu->hangcheck_timer);
+
+ queue_work(priv->wq, &gpu->recover_work);
+}
+
#define RBBM_ERROR_MASK \
(A5XX_RBBM_INT_0_MASK_RBBM_AHB_ERROR | \
A5XX_RBBM_INT_0_MASK_RBBM_TRANSFER_TIMEOUT | \
@@ -869,6 +892,9 @@ static irqreturn_t a5xx_irq(struct msm_gpu *gpu)
if (status & A5XX_RBBM_INT_0_MASK_CP_HW_ERROR)
a5xx_cp_err_irq(gpu);
+ if (status & A5XX_RBBM_INT_0_MASK_MISC_HANG_DETECT)
+ a5xx_fault_detect_irq(gpu);
+
if (status & A5XX_RBBM_INT_0_MASK_UCHE_OOB_ACCESS)
a5xx_uche_err_irq(gpu);
--
1.9.1
More information about the dri-devel
mailing list