[PATCH] drm/msm: Wait for MMU devcoredump when waiting for GMU

Mon Jul 21 15:32:13 UTC 2025

On Fri, Jul 18, 2025 at 6:50 AM Connor Abbott <cwabbott0 at gmail.com> wrote:
>
> If there is a flood of faults then the MMU can become saturated while it
> waits for the kernel to process the first fault and resume it, so that
> the GMU becomes blocked. This is mainly a problem when the kernel reads
> the state of the GPU for a devcoredump, because this takes a while. If
> we timeout waiting for the GMU, check if this has happened and retry
> after we're finished.
>
> Signed-off-by: Connor Abbott <cwabbott0 at gmail.com>
> ---
>  drivers/gpu/drm/msm/adreno/a6xx_gmu.c   | 21 ++++++++++++++++++---
>  drivers/gpu/drm/msm/adreno/a6xx_hfi.c   | 21 ++++++++++++++++++---
>  drivers/gpu/drm/msm/adreno/adreno_gpu.c | 11 +++++++++++
>  drivers/gpu/drm/msm/adreno/adreno_gpu.h |  2 ++
>  4 files changed, 49 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c
> index 28e6705c6da682c7b41c748e375dda59a6551898..6ec396fab22d194481a76d30b2d36ea5fb662241 100644
> --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c
> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c
> @@ -340,6 +340,7 @@ int a6xx_gmu_set_oob(struct a6xx_gmu *gmu, enum a6xx_gmu_oob_state state)
>         int ret;
>         u32 val;
>         int request, ack;
> +       struct a6xx_gpu *a6xx_gpu = container_of(gmu, struct a6xx_gpu, gmu);
>
>         WARN_ON_ONCE(!mutex_is_locked(&gmu->lock));
>
> @@ -363,9 +364,23 @@ int a6xx_gmu_set_oob(struct a6xx_gmu *gmu, enum a6xx_gmu_oob_state state)
>         /* Trigger the equested OOB operation */
>         gmu_write(gmu, REG_A6XX_GMU_HOST2GMU_INTR_SET, 1 << request);
>
> -       /* Wait for the acknowledge interrupt */
> -       ret = gmu_poll_timeout(gmu, REG_A6XX_GMU_GMU2HOST_INTR_INFO, val,
> -               val & (1 << ack), 100, 10000);
> +       do {
> +               /* Wait for the acknowledge interrupt */
> +               ret = gmu_poll_timeout(gmu, REG_A6XX_GMU_GMU2HOST_INTR_INFO, val,
> +                       val & (1 << ack), 100, 10000);
> +
> +               if (!ret)
> +                       break;
> +
> +               if (completion_done(&a6xx_gpu->base.fault_coredump_done))
> +                       break;
> +
> +               /* We may timeout because the GMU is temporarily wedged from
> +                * pending faults from the GPU and we are taking a devcoredump.
> +                * Wait until the MMU is resumed and try again.
> +                */
> +               wait_for_completion(&a6xx_gpu->base.fault_coredump_done);
> +       } while (true);

It is a bit sad to duplicate this nearly identical code twice.  And I
wonder if other gmu_poll_timeout()'s need similar treatment?  Maybe
Akhil has an opinion about whether we should just build this into
gmu_poll_timeout() instead?

BR,
-R

>
>         if (ret)
>                 DRM_DEV_ERROR(gmu->dev,
> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_hfi.c b/drivers/gpu/drm/msm/adreno/a6xx_hfi.c
> index 8e69b1e8465711837151725c8f70e7b4b16a368e..4e775ca757ce3649ac238d25cebfd7eb693fda61 100644
> --- a/drivers/gpu/drm/msm/adreno/a6xx_hfi.c
> +++ b/drivers/gpu/drm/msm/adreno/a6xx_hfi.c
> @@ -104,10 +104,25 @@ static int a6xx_hfi_wait_for_msg_interrupt(struct a6xx_gmu *gmu, u32 id, u32 seq
>  {
>         int ret;
>         u32 val;
> +       struct a6xx_gpu *a6xx_gpu = container_of(gmu, struct a6xx_gpu, gmu);
> +
> +       do {
> +               /* Wait for a response */
> +               ret = gmu_poll_timeout(gmu, REG_A6XX_GMU_GMU2HOST_INTR_INFO, val,
> +                       val & A6XX_GMU_GMU2HOST_INTR_INFO_MSGQ, 100, 1000000);
> +
> +               if (!ret)
> +                       break;
>
> -       /* Wait for a response */
> -       ret = gmu_poll_timeout(gmu, REG_A6XX_GMU_GMU2HOST_INTR_INFO, val,
> -               val & A6XX_GMU_GMU2HOST_INTR_INFO_MSGQ, 100, 1000000);
> +               if (completion_done(&a6xx_gpu->base.fault_coredump_done))
> +                       break;
> +
> +               /* We may timeout because the GMU is temporarily wedged from
> +                * pending faults from the GPU and we are taking a devcoredump.
> +                * Wait until the MMU is resumed and try again.
> +                */
> +               wait_for_completion(&a6xx_gpu->base.fault_coredump_done);
> +       } while (true);
>
>         if (ret) {
>                 DRM_DEV_ERROR(gmu->dev,
> diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
> index f1230465bf0d0840274a6eb03a10c4df3a7a68d3..19181b6fddfd518e2f60324da1a7087458115e40 100644
> --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c
> +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
> @@ -293,6 +293,7 @@ int adreno_fault_handler(struct msm_gpu *gpu, unsigned long iova, int flags,
>                          struct adreno_smmu_fault_info *info, const char *block,
>                          u32 scratch[4])
>  {
> +       struct adreno_gpu *adreno_gpu = container_of(gpu, struct adreno_gpu, base);
>         struct msm_drm_private *priv = gpu->dev->dev_private;
>         struct msm_mmu *mmu = to_msm_vm(gpu->vm)->mmu;
>         const char *type = "UNKNOWN";
> @@ -345,6 +346,11 @@ int adreno_fault_handler(struct msm_gpu *gpu, unsigned long iova, int flags,
>                 /* Turn off the hangcheck timer to keep it from bothering us */
>                 timer_delete(&gpu->hangcheck_timer);
>
> +               /* Let any concurrent GMU transactions know that the MMU may be
> +                * blocked for a while and they should wait on us.
> +                */
> +               reinit_completion(&adreno_gpu->fault_coredump_done);
> +
>                 fault_info.ttbr0 = info->ttbr0;
>                 fault_info.iova  = iova;
>                 fault_info.flags = flags;
> @@ -352,6 +358,8 @@ int adreno_fault_handler(struct msm_gpu *gpu, unsigned long iova, int flags,
>                 fault_info.block = block;
>
>                 msm_gpu_fault_crashstate_capture(gpu, &fault_info);
> +
> +               complete_all(&adreno_gpu->fault_coredump_done);
>         }
>
>         return 0;
> @@ -1238,6 +1246,9 @@ int adreno_gpu_init(struct drm_device *drm, struct platform_device *pdev,
>         if (ret)
>                 return ret;
>
> +       init_completion(&adreno_gpu->fault_coredump_done);
> +       complete_all(&adreno_gpu->fault_coredump_done);
> +
>         pm_runtime_set_autosuspend_delay(dev,
>                 adreno_gpu->info->inactive_period);
>         pm_runtime_use_autosuspend(dev);
> diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.h b/drivers/gpu/drm/msm/adreno/adreno_gpu.h
> index 9dc93c247196d5b8b3659157f7aeea81809d4056..f16556c6f2921708e740ecd47f5b4668e87700aa 100644
> --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.h
> +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.h
> @@ -179,6 +179,8 @@ struct adreno_gpu {
>         uint16_t speedbin;
>         const struct adreno_gpu_funcs *funcs;
>
> +       struct completion fault_coredump_done;
> +
>         /* interesting register offsets to dump: */
>         const unsigned int *registers;
>
>
> ---
> base-commit: 8290d37ad2b087bbcfe65fa5bcaf260e184b250a
> change-id: 20250718-msm-gmu-fault-wait-465543a718fa
>
> Best regards,
> --
> Connor Abbott <cwabbott0 at gmail.com>
>