[PATCH] drm/msm: Wait for MMU devcoredump when waiting for GMU
Rob Clark
rob.clark at oss.qualcomm.com
Mon Jul 21 15:32:13 UTC 2025
On Fri, Jul 18, 2025 at 6:50 AM Connor Abbott <cwabbott0 at gmail.com> wrote:
>
> If there is a flood of faults then the MMU can become saturated while it
> waits for the kernel to process the first fault and resume it, so that
> the GMU becomes blocked. This is mainly a problem when the kernel reads
> the state of the GPU for a devcoredump, because this takes a while. If
> we timeout waiting for the GMU, check if this has happened and retry
> after we're finished.
>
> Signed-off-by: Connor Abbott <cwabbott0 at gmail.com>
> ---
> drivers/gpu/drm/msm/adreno/a6xx_gmu.c | 21 ++++++++++++++++++---
> drivers/gpu/drm/msm/adreno/a6xx_hfi.c | 21 ++++++++++++++++++---
> drivers/gpu/drm/msm/adreno/adreno_gpu.c | 11 +++++++++++
> drivers/gpu/drm/msm/adreno/adreno_gpu.h | 2 ++
> 4 files changed, 49 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c
> index 28e6705c6da682c7b41c748e375dda59a6551898..6ec396fab22d194481a76d30b2d36ea5fb662241 100644
> --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c
> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c
> @@ -340,6 +340,7 @@ int a6xx_gmu_set_oob(struct a6xx_gmu *gmu, enum a6xx_gmu_oob_state state)
> int ret;
> u32 val;
> int request, ack;
> + struct a6xx_gpu *a6xx_gpu = container_of(gmu, struct a6xx_gpu, gmu);
>
> WARN_ON_ONCE(!mutex_is_locked(&gmu->lock));
>
> @@ -363,9 +364,23 @@ int a6xx_gmu_set_oob(struct a6xx_gmu *gmu, enum a6xx_gmu_oob_state state)
> /* Trigger the equested OOB operation */
> gmu_write(gmu, REG_A6XX_GMU_HOST2GMU_INTR_SET, 1 << request);
>
> - /* Wait for the acknowledge interrupt */
> - ret = gmu_poll_timeout(gmu, REG_A6XX_GMU_GMU2HOST_INTR_INFO, val,
> - val & (1 << ack), 100, 10000);
> + do {
> + /* Wait for the acknowledge interrupt */
> + ret = gmu_poll_timeout(gmu, REG_A6XX_GMU_GMU2HOST_INTR_INFO, val,
> + val & (1 << ack), 100, 10000);
> +
> + if (!ret)
> + break;
> +
> + if (completion_done(&a6xx_gpu->base.fault_coredump_done))
> + break;
> +
> + /* We may timeout because the GMU is temporarily wedged from
> + * pending faults from the GPU and we are taking a devcoredump.
> + * Wait until the MMU is resumed and try again.
> + */
> + wait_for_completion(&a6xx_gpu->base.fault_coredump_done);
> + } while (true);
It is a bit sad to duplicate this nearly identical code twice. And I
wonder if other gmu_poll_timeout()'s need similar treatment? Maybe
Akhil has an opinion about whether we should just build this into
gmu_poll_timeout() instead?
BR,
-R
>
> if (ret)
> DRM_DEV_ERROR(gmu->dev,
> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_hfi.c b/drivers/gpu/drm/msm/adreno/a6xx_hfi.c
> index 8e69b1e8465711837151725c8f70e7b4b16a368e..4e775ca757ce3649ac238d25cebfd7eb693fda61 100644
> --- a/drivers/gpu/drm/msm/adreno/a6xx_hfi.c
> +++ b/drivers/gpu/drm/msm/adreno/a6xx_hfi.c
> @@ -104,10 +104,25 @@ static int a6xx_hfi_wait_for_msg_interrupt(struct a6xx_gmu *gmu, u32 id, u32 seq
> {
> int ret;
> u32 val;
> + struct a6xx_gpu *a6xx_gpu = container_of(gmu, struct a6xx_gpu, gmu);
> +
> + do {
> + /* Wait for a response */
> + ret = gmu_poll_timeout(gmu, REG_A6XX_GMU_GMU2HOST_INTR_INFO, val,
> + val & A6XX_GMU_GMU2HOST_INTR_INFO_MSGQ, 100, 1000000);
> +
> + if (!ret)
> + break;
>
> - /* Wait for a response */
> - ret = gmu_poll_timeout(gmu, REG_A6XX_GMU_GMU2HOST_INTR_INFO, val,
> - val & A6XX_GMU_GMU2HOST_INTR_INFO_MSGQ, 100, 1000000);
> + if (completion_done(&a6xx_gpu->base.fault_coredump_done))
> + break;
> +
> + /* We may timeout because the GMU is temporarily wedged from
> + * pending faults from the GPU and we are taking a devcoredump.
> + * Wait until the MMU is resumed and try again.
> + */
> + wait_for_completion(&a6xx_gpu->base.fault_coredump_done);
> + } while (true);
>
> if (ret) {
> DRM_DEV_ERROR(gmu->dev,
> diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
> index f1230465bf0d0840274a6eb03a10c4df3a7a68d3..19181b6fddfd518e2f60324da1a7087458115e40 100644
> --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c
> +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
> @@ -293,6 +293,7 @@ int adreno_fault_handler(struct msm_gpu *gpu, unsigned long iova, int flags,
> struct adreno_smmu_fault_info *info, const char *block,
> u32 scratch[4])
> {
> + struct adreno_gpu *adreno_gpu = container_of(gpu, struct adreno_gpu, base);
> struct msm_drm_private *priv = gpu->dev->dev_private;
> struct msm_mmu *mmu = to_msm_vm(gpu->vm)->mmu;
> const char *type = "UNKNOWN";
> @@ -345,6 +346,11 @@ int adreno_fault_handler(struct msm_gpu *gpu, unsigned long iova, int flags,
> /* Turn off the hangcheck timer to keep it from bothering us */
> timer_delete(&gpu->hangcheck_timer);
>
> + /* Let any concurrent GMU transactions know that the MMU may be
> + * blocked for a while and they should wait on us.
> + */
> + reinit_completion(&adreno_gpu->fault_coredump_done);
> +
> fault_info.ttbr0 = info->ttbr0;
> fault_info.iova = iova;
> fault_info.flags = flags;
> @@ -352,6 +358,8 @@ int adreno_fault_handler(struct msm_gpu *gpu, unsigned long iova, int flags,
> fault_info.block = block;
>
> msm_gpu_fault_crashstate_capture(gpu, &fault_info);
> +
> + complete_all(&adreno_gpu->fault_coredump_done);
> }
>
> return 0;
> @@ -1238,6 +1246,9 @@ int adreno_gpu_init(struct drm_device *drm, struct platform_device *pdev,
> if (ret)
> return ret;
>
> + init_completion(&adreno_gpu->fault_coredump_done);
> + complete_all(&adreno_gpu->fault_coredump_done);
> +
> pm_runtime_set_autosuspend_delay(dev,
> adreno_gpu->info->inactive_period);
> pm_runtime_use_autosuspend(dev);
> diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.h b/drivers/gpu/drm/msm/adreno/adreno_gpu.h
> index 9dc93c247196d5b8b3659157f7aeea81809d4056..f16556c6f2921708e740ecd47f5b4668e87700aa 100644
> --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.h
> +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.h
> @@ -179,6 +179,8 @@ struct adreno_gpu {
> uint16_t speedbin;
> const struct adreno_gpu_funcs *funcs;
>
> + struct completion fault_coredump_done;
> +
> /* interesting register offsets to dump: */
> const unsigned int *registers;
>
>
> ---
> base-commit: 8290d37ad2b087bbcfe65fa5bcaf260e184b250a
> change-id: 20250718-msm-gmu-fault-wait-465543a718fa
>
> Best regards,
> --
> Connor Abbott <cwabbott0 at gmail.com>
>
More information about the Freedreno
mailing list