[PATCH 2/2] drm/msm: Hangcheck progress detection
Akhil P Oommen
quic_akhilpo at quicinc.com
Tue Nov 1 19:54:08 UTC 2022
On 11/1/2022 4:24 AM, Rob Clark wrote:
> From: Rob Clark <robdclark at chromium.org>
>
> If the hangcheck timer expires, check if the fw's position in the
> cmdstream has advanced (changed) since last timer expiration, and
> allow it up to three additional "extensions" to its allotted time.
> The intention is to continue to catch "shader stuck in a loop" type
> hangs quickly, but allow more time for things that are actually
> making forward progress.
>
> Because we need to sample the CP state twice to detect if there has
> not been progress, this also cuts the timer's duration in half.
>
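If I follow the arithmetic here (using the new 250ms default below):

    no CP movement:  ~2 x 250ms = ~500ms before the job is declared hung
    CP still moving: up to 3 extra expirations, i.e. up to ~750ms of extra time

so the "stuck in a loop" case is still caught quickly while submits that keep
advancing in the cmdstream get the extra headroom.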
> Signed-off-by: Rob Clark <robdclark at chromium.org>
> ---
> drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 34 +++++++++++++++++++++++++++
> drivers/gpu/drm/msm/msm_drv.h | 8 ++++++-
> drivers/gpu/drm/msm/msm_gpu.c | 20 +++++++++++++++-
> drivers/gpu/drm/msm/msm_gpu.h | 5 +++-
> drivers/gpu/drm/msm/msm_ringbuffer.h | 24 +++++++++++++++++++
> 5 files changed, 88 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> index 1ff605c18ee6..3b8fb7a11dff 100644
> --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> @@ -1843,6 +1843,39 @@ static uint32_t a6xx_get_rptr(struct msm_gpu *gpu, struct msm_ringbuffer *ring)
> return ring->memptrs->rptr = gpu_read(gpu, REG_A6XX_CP_RB_RPTR);
> }
>
> +static bool a6xx_progress(struct msm_gpu *gpu, struct msm_ringbuffer *ring)
> +{
> + struct msm_cp_state cp_state = {
> + .ib1_base = gpu_read64(gpu, REG_A6XX_CP_IB1_BASE),
> + .ib2_base = gpu_read64(gpu, REG_A6XX_CP_IB2_BASE),
> + .ib1_rem = gpu_read(gpu, REG_A6XX_CP_IB1_REM_SIZE),
> + .ib2_rem = gpu_read(gpu, REG_A6XX_CP_IB2_REM_SIZE),
> + };
> + bool progress;
> +
> + /*
> + * Adjust the remaining data to account for what has already been
> + * fetched from memory, but not yet consumed by the SQE.
> + *
> +	 * This is not *technically* correct; the amount buffered could
> + * exceed the IB size due to hw prefetching ahead, but:
> + *
> + * (1) We aren't trying to find the exact position, just whether
> + * progress has been made
> + * (2) The CP_REG_TO_MEM at the end of a submit should be enough
> + * to prevent prefetching into an unrelated submit. (And
> + * either way, at some point the ROQ will be full.)
> + */
> + cp_state.ib1_rem += gpu_read(gpu, REG_A6XX_CP_CSQ_IB1_STAT) >> 16;
> + cp_state.ib2_rem += gpu_read(gpu, REG_A6XX_CP_CSQ_IB1_STAT) >> 16;
In the second line above, REG_A6XX_CP_CSQ_IB1_STAT should be REG_A6XX_CP_CSQ_IB2_STAT.
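I.e., assuming the intent is to adjust each IB's remaining size from its own
CSQ status register, something like:

	cp_state.ib1_rem += gpu_read(gpu, REG_A6XX_CP_CSQ_IB1_STAT) >> 16;
	cp_state.ib2_rem += gpu_read(gpu, REG_A6XX_CP_CSQ_IB2_STAT) >> 16;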
With that, Reviewed-by: Akhil P Oommen <quic_akhilpo at quicinc.com>
-Akhil.
> +
> + progress = !!memcmp(&cp_state, &ring->last_cp_state, sizeof(cp_state));
> +
> + ring->last_cp_state = cp_state;
> +
> + return progress;
> +}
> +
> static u32 a618_get_speed_bin(u32 fuse)
> {
> if (fuse == 0)
> @@ -1961,6 +1994,7 @@ static const struct adreno_gpu_funcs funcs = {
> .create_address_space = a6xx_create_address_space,
> .create_private_address_space = a6xx_create_private_address_space,
> .get_rptr = a6xx_get_rptr,
> + .progress = a6xx_progress,
> },
> .get_timestamp = a6xx_get_timestamp,
> };
> diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h
> index efcd7260f428..970a1a0ab34f 100644
> --- a/drivers/gpu/drm/msm/msm_drv.h
> +++ b/drivers/gpu/drm/msm/msm_drv.h
> @@ -226,7 +226,13 @@ struct msm_drm_private {
>
> struct drm_atomic_state *pm_state;
>
> - /* For hang detection, in ms */
> + /**
> + * hangcheck_period: For hang detection, in ms
> + *
> + * Note that in practice, a submit/job will get at least two hangcheck
> + * periods, due to checking for progress being implemented as simply
> + * "have the CP position registers changed since last time?"
> + */
> unsigned int hangcheck_period;
>
> /**
> diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
> index 3dffee54a951..136f5977b0bf 100644
> --- a/drivers/gpu/drm/msm/msm_gpu.c
> +++ b/drivers/gpu/drm/msm/msm_gpu.c
> @@ -500,6 +500,21 @@ static void hangcheck_timer_reset(struct msm_gpu *gpu)
> round_jiffies_up(jiffies + msecs_to_jiffies(priv->hangcheck_period)));
> }
>
> +static bool made_progress(struct msm_gpu *gpu, struct msm_ringbuffer *ring)
> +{
> + if (ring->hangcheck_progress_retries >= DRM_MSM_HANGCHECK_PROGRESS_RETRIES)
> + return false;
> +
> + if (!gpu->funcs->progress)
> + return false;
> +
> + if (!gpu->funcs->progress(gpu, ring))
> + return false;
> +
> + ring->hangcheck_progress_retries++;
> + return true;
> +}
> +
> static void hangcheck_handler(struct timer_list *t)
> {
> struct msm_gpu *gpu = from_timer(gpu, t, hangcheck_timer);
> @@ -511,9 +526,12 @@ static void hangcheck_handler(struct timer_list *t)
> if (fence != ring->hangcheck_fence) {
> /* some progress has been made.. ya! */
> ring->hangcheck_fence = fence;
> - } else if (fence_before(fence, ring->fctx->last_fence)) {
> + ring->hangcheck_progress_retries = 0;
> + } else if (fence_before(fence, ring->fctx->last_fence) &&
> + !made_progress(gpu, ring)) {
> /* no progress and not done.. hung! */
> ring->hangcheck_fence = fence;
> + ring->hangcheck_progress_retries = 0;
> DRM_DEV_ERROR(dev->dev, "%s: hangcheck detected gpu lockup rb %d!\n",
> gpu->name, ring->id);
> DRM_DEV_ERROR(dev->dev, "%s: completed fence: %u\n",
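Just to confirm I'm reading the retry logic right: for a submit whose fence is
stuck but whose CP keeps advancing, the sequence would be roughly

    expiration 1: fence unchanged, CP advanced -> retries = 1, timer rearmed
    expiration 2: fence unchanged, CP advanced -> retries = 2, timer rearmed
    expiration 3: fence unchanged, CP advanced -> retries = 3, timer rearmed
    expiration 4: retries >= DRM_MSM_HANGCHECK_PROGRESS_RETRIES -> declared hung

and GPUs that don't implement the new ->progress() callback fall back to the
old behaviour of declaring a hang on the first expiration without fence
progress.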
> diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
> index 585fd9c8d45a..d8f355e9f0b2 100644
> --- a/drivers/gpu/drm/msm/msm_gpu.h
> +++ b/drivers/gpu/drm/msm/msm_gpu.h
> @@ -78,6 +78,8 @@ struct msm_gpu_funcs {
> struct msm_gem_address_space *(*create_private_address_space)
> (struct msm_gpu *gpu);
> uint32_t (*get_rptr)(struct msm_gpu *gpu, struct msm_ringbuffer *ring);
> +
> + bool (*progress)(struct msm_gpu *gpu, struct msm_ringbuffer *ring);
> };
>
> /* Additional state for iommu faults: */
> @@ -236,7 +238,8 @@ struct msm_gpu {
> */
> #define DRM_MSM_INACTIVE_PERIOD 66 /* in ms (roughly four frames) */
>
> -#define DRM_MSM_HANGCHECK_DEFAULT_PERIOD 500 /* in ms */
> +#define DRM_MSM_HANGCHECK_DEFAULT_PERIOD 250 /* in ms */
> +#define DRM_MSM_HANGCHECK_PROGRESS_RETRIES 3
> struct timer_list hangcheck_timer;
>
> /* Fault info for most recent iova fault: */
> diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.h b/drivers/gpu/drm/msm/msm_ringbuffer.h
> index 2a5045abe46e..e3d33bae3380 100644
> --- a/drivers/gpu/drm/msm/msm_ringbuffer.h
> +++ b/drivers/gpu/drm/msm/msm_ringbuffer.h
> @@ -35,6 +35,11 @@ struct msm_rbmemptrs {
> volatile u64 ttbr0;
> };
>
> +struct msm_cp_state {
> + uint64_t ib1_base, ib2_base;
> + uint32_t ib1_rem, ib2_rem;
> +};
> +
I think we can add CP_RB_RPTR here too.
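For illustration (the field name is just a suggestion), that would mean
something like

	struct msm_cp_state {
		uint64_t ib1_base, ib2_base;
		uint32_t ib1_rem, ib2_rem;
		uint32_t rb_rptr;
	};

with a6xx_progress() also sampling it:

	cp_state.rb_rptr = gpu_read(gpu, REG_A6XX_CP_RB_RPTR);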
> struct msm_ringbuffer {
> struct msm_gpu *gpu;
> int id;
> @@ -64,6 +69,25 @@ struct msm_ringbuffer {
> uint64_t memptrs_iova;
> struct msm_fence_context *fctx;
>
> + /**
> + * hangcheck_progress_retries:
> + *
> +	 * The number of extra hangcheck periods that have been granted because
> +	 * the GPU appears to be making forward progress.
> + *
> +	 * If the GPU appears to be making progress (i.e. the CP has advanced
> +	 * in the command stream), we'll allow up to DRM_MSM_HANGCHECK_PROGRESS_RETRIES
> + * expirations of the hangcheck timer before killing the job. In other
> + * words we'll let the submit run for up to
> + * DRM_MSM_HANGCHECK_DEFAULT_PERIOD * DRM_MSM_HANGCHECK_PROGRESS_RETRIES
> + */
> + int hangcheck_progress_retries;
> +
> + /**
> + * last_cp_state: The state of the CP at the last call to gpu->progress()
> + */
> + struct msm_cp_state last_cp_state;
> +
> /*
> * preempt_lock protects preemption and serializes wptr updates against
> 	 * preemption. Can be acquired from irq context.