[PATCH] drm/msm: Wait for MMU devcoredump when waiting for GMU

Thu Jul 24 19:48:33 UTC 2025

On 7/21/2025 9:02 PM, Rob Clark wrote:
> On Fri, Jul 18, 2025 at 6:50 AM Connor Abbott <cwabbott0 at gmail.com> wrote:
>>
>> If there is a flood of faults then the MMU can become saturated while it
>> waits for the kernel to process the first fault and resume it, so that
>> the GMU becomes blocked. This is mainly a problem when the kernel reads
>> the state of the GPU for a devcoredump, because this takes a while. If
>> we timeout waiting for the GMU, check if this has happened and retry
>> after we're finished.
>>
>> Signed-off-by: Connor Abbott <cwabbott0 at gmail.com>
>> ---
>>  drivers/gpu/drm/msm/adreno/a6xx_gmu.c   | 21 ++++++++++++++++++---
>>  drivers/gpu/drm/msm/adreno/a6xx_hfi.c   | 21 ++++++++++++++++++---
>>  drivers/gpu/drm/msm/adreno/adreno_gpu.c | 11 +++++++++++
>>  drivers/gpu/drm/msm/adreno/adreno_gpu.h |  2 ++
>>  4 files changed, 49 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c
>> index 28e6705c6da682c7b41c748e375dda59a6551898..6ec396fab22d194481a76d30b2d36ea5fb662241 100644
>> --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c
>> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c
>> @@ -340,6 +340,7 @@ int a6xx_gmu_set_oob(struct a6xx_gmu *gmu, enum a6xx_gmu_oob_state state)
>>         int ret;
>>         u32 val;
>>         int request, ack;
>> +       struct a6xx_gpu *a6xx_gpu = container_of(gmu, struct a6xx_gpu, gmu);
>>
>>         WARN_ON_ONCE(!mutex_is_locked(&gmu->lock));
>>
>> @@ -363,9 +364,23 @@ int a6xx_gmu_set_oob(struct a6xx_gmu *gmu, enum a6xx_gmu_oob_state state)
>>         /* Trigger the equested OOB operation */
>>         gmu_write(gmu, REG_A6XX_GMU_HOST2GMU_INTR_SET, 1 << request);
>>
>> -       /* Wait for the acknowledge interrupt */
>> -       ret = gmu_poll_timeout(gmu, REG_A6XX_GMU_GMU2HOST_INTR_INFO, val,
>> -               val & (1 << ack), 100, 10000);
>> +       do {
>> +               /* Wait for the acknowledge interrupt */
>> +               ret = gmu_poll_timeout(gmu, REG_A6XX_GMU_GMU2HOST_INTR_INFO, val,
>> +                       val & (1 << ack), 100, 10000);
>> +
>> +               if (!ret)
>> +                       break;
>> +
>> +               if (completion_done(&a6xx_gpu->base.fault_coredump_done))

I didn't get why this is required. Could you please add a comment?

>> +                       break;
>> +
>> +               /* We may timeout because the GMU is temporarily wedged from
>> +                * pending faults from the GPU and we are taking a devcoredump.
>> +                * Wait until the MMU is resumed and try again.
>> +                */
>> +               wait_for_completion(&a6xx_gpu->base.fault_coredump_done);
>> +       } while (true);
> 
> It is a bit sad to duplicate this nearly identical code twice.  And I
> wonder if other gmu_poll_timeout()'s need similar treatment?  Maybe
> Akhil has an opinion about whether we should just build this into
> gmu_poll_timeout() instead?

Yeah. That make sense. A potential issue I see is that we might be
holding both gpu and gmu locks here and the crashstate capture in the pf
handler tries to lock gpu, which can result in a dead lock.

-Akhil.

> 
> BR,
> -R
> 
>>
>>         if (ret)
>>                 DRM_DEV_ERROR(gmu->dev,
>> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_hfi.c b/drivers/gpu/drm/msm/adreno/a6xx_hfi.c
>> index 8e69b1e8465711837151725c8f70e7b4b16a368e..4e775ca757ce3649ac238d25cebfd7eb693fda61 100644
>> --- a/drivers/gpu/drm/msm/adreno/a6xx_hfi.c
>> +++ b/drivers/gpu/drm/msm/adreno/a6xx_hfi.c
>> @@ -104,10 +104,25 @@ static int a6xx_hfi_wait_for_msg_interrupt(struct a6xx_gmu *gmu, u32 id, u32 seq
>>  {
>>         int ret;
>>         u32 val;
>> +       struct a6xx_gpu *a6xx_gpu = container_of(gmu, struct a6xx_gpu, gmu);
>> +
>> +       do {
>> +               /* Wait for a response */
>> +               ret = gmu_poll_timeout(gmu, REG_A6XX_GMU_GMU2HOST_INTR_INFO, val,
>> +                       val & A6XX_GMU_GMU2HOST_INTR_INFO_MSGQ, 100, 1000000);
>> +
>> +               if (!ret)
>> +                       break;
>>
>> -       /* Wait for a response */
>> -       ret = gmu_poll_timeout(gmu, REG_A6XX_GMU_GMU2HOST_INTR_INFO, val,
>> -               val & A6XX_GMU_GMU2HOST_INTR_INFO_MSGQ, 100, 1000000);
>> +               if (completion_done(&a6xx_gpu->base.fault_coredump_done))
>> +                       break;
>> +
>> +               /* We may timeout because the GMU is temporarily wedged from
>> +                * pending faults from the GPU and we are taking a devcoredump.
>> +                * Wait until the MMU is resumed and try again.
>> +                */
>> +               wait_for_completion(&a6xx_gpu->base.fault_coredump_done);
>> +       } while (true);
>>
>>         if (ret) {
>>                 DRM_DEV_ERROR(gmu->dev,
>> diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
>> index f1230465bf0d0840274a6eb03a10c4df3a7a68d3..19181b6fddfd518e2f60324da1a7087458115e40 100644
>> --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c
>> +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
>> @@ -293,6 +293,7 @@ int adreno_fault_handler(struct msm_gpu *gpu, unsigned long iova, int flags,
>>                          struct adreno_smmu_fault_info *info, const char *block,
>>                          u32 scratch[4])
>>  {
>> +       struct adreno_gpu *adreno_gpu = container_of(gpu, struct adreno_gpu, base);
>>         struct msm_drm_private *priv = gpu->dev->dev_private;
>>         struct msm_mmu *mmu = to_msm_vm(gpu->vm)->mmu;
>>         const char *type = "UNKNOWN";
>> @@ -345,6 +346,11 @@ int adreno_fault_handler(struct msm_gpu *gpu, unsigned long iova, int flags,
>>                 /* Turn off the hangcheck timer to keep it from bothering us */
>>                 timer_delete(&gpu->hangcheck_timer);
>>
>> +               /* Let any concurrent GMU transactions know that the MMU may be
>> +                * blocked for a while and they should wait on us.
>> +                */
>> +               reinit_completion(&adreno_gpu->fault_coredump_done);
>> +
>>                 fault_info.ttbr0 = info->ttbr0;
>>                 fault_info.iova  = iova;
>>                 fault_info.flags = flags;
>> @@ -352,6 +358,8 @@ int adreno_fault_handler(struct msm_gpu *gpu, unsigned long iova, int flags,
>>                 fault_info.block = block;
>>
>>                 msm_gpu_fault_crashstate_capture(gpu, &fault_info);
>> +
>> +               complete_all(&adreno_gpu->fault_coredump_done);
>>         }
>>
>>         return 0;
>> @@ -1238,6 +1246,9 @@ int adreno_gpu_init(struct drm_device *drm, struct platform_device *pdev,
>>         if (ret)
>>                 return ret;
>>
>> +       init_completion(&adreno_gpu->fault_coredump_done);
>> +       complete_all(&adreno_gpu->fault_coredump_done);
>> +
>>         pm_runtime_set_autosuspend_delay(dev,
>>                 adreno_gpu->info->inactive_period);
>>         pm_runtime_use_autosuspend(dev);
>> diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.h b/drivers/gpu/drm/msm/adreno/adreno_gpu.h
>> index 9dc93c247196d5b8b3659157f7aeea81809d4056..f16556c6f2921708e740ecd47f5b4668e87700aa 100644
>> --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.h
>> +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.h
>> @@ -179,6 +179,8 @@ struct adreno_gpu {
>>         uint16_t speedbin;
>>         const struct adreno_gpu_funcs *funcs;
>>
>> +       struct completion fault_coredump_done;
>> +
>>         /* interesting register offsets to dump: */
>>         const unsigned int *registers;
>>
>>
>> ---
>> base-commit: 8290d37ad2b087bbcfe65fa5bcaf260e184b250a
>> change-id: 20250718-msm-gmu-fault-wait-465543a718fa
>>
>> Best regards,
>> --
>> Connor Abbott <cwabbott0 at gmail.com>
>>