[PATCH v2 2/2] drm/msm: Hangcheck progress detection

Hillf Danton hdanton at sina.com
Wed Nov 2 02:09:17 UTC 2022


On 1 Nov 2022 15:33:10 -0700 Rob Clark <robdclark at chromium.org>
> --- a/drivers/gpu/drm/msm/msm_gpu.c
> +++ b/drivers/gpu/drm/msm/msm_gpu.c
> @@ -500,6 +500,21 @@ static void hangcheck_timer_reset(struct msm_gpu *gpu)
>  			round_jiffies_up(jiffies + msecs_to_jiffies(priv->hangcheck_period)));
>  }
>  
> +static bool made_progress(struct msm_gpu *gpu, struct msm_ringbuffer *ring)
> +{
> +	if (ring->hangcheck_progress_retries >= DRM_MSM_HANGCHECK_PROGRESS_RETRIES)
> +		return false;
> +
> +	if (!gpu->funcs->progress)
> +		return false;

Retrying cannot make any difference when no progress callback is provided.

> +
> +	if (!gpu->funcs->progress(gpu, ring))
> +		return false;
> +
> +	ring->hangcheck_progress_retries++;
> +	return true;
> +}
> +
>  static void hangcheck_handler(struct timer_list *t)
>  {
>  	struct msm_gpu *gpu = from_timer(gpu, t, hangcheck_timer);
> @@ -511,9 +526,12 @@ static void hangcheck_handler(struct timer_list *t)
>  	if (fence != ring->hangcheck_fence) {
>  		/* some progress has been made.. ya! */
>  		ring->hangcheck_fence = fence;
> -	} else if (fence_before(fence, ring->fctx->last_fence)) {
> +		ring->hangcheck_progress_retries = 0;
> +	} else if (fence_before(fence, ring->fctx->last_fence) &&
> +			!made_progress(gpu, ring)) {
>  		/* no progress and not done.. hung! */
>  		ring->hangcheck_fence = fence;
> +		ring->hangcheck_progress_retries = 0;
>  		DRM_DEV_ERROR(dev->dev, "%s: hangcheck detected gpu lockup rb %d!\n",
>  				gpu->name, ring->id);
>  		DRM_DEV_ERROR(dev->dev, "%s:     completed fence: %u\n",

Cutting DRM_MSM_HANGCHECK_DEFAULT_PERIOD down to 250ms leads to false hang
reports when ->progress is not implemented: made_progress() then always
returns false, so a hang is declared after a single 250ms period instead of
500ms.

> diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
> index 585fd9c8d45a..d8f355e9f0b2 100644
> --- a/drivers/gpu/drm/msm/msm_gpu.h
> +++ b/drivers/gpu/drm/msm/msm_gpu.h
> @@ -78,6 +78,8 @@ struct msm_gpu_funcs {
>  	struct msm_gem_address_space *(*create_private_address_space)
>  		(struct msm_gpu *gpu);
>  	uint32_t (*get_rptr)(struct msm_gpu *gpu, struct msm_ringbuffer *ring);
> +
> +	bool (*progress)(struct msm_gpu *gpu, struct msm_ringbuffer *ring);
>  };
>  
>  /* Additional state for iommu faults: */
> @@ -236,7 +238,8 @@ struct msm_gpu {
>  	 */
>  #define DRM_MSM_INACTIVE_PERIOD   66 /* in ms (roughly four frames) */
>  
> -#define DRM_MSM_HANGCHECK_DEFAULT_PERIOD 500 /* in ms */
> +#define DRM_MSM_HANGCHECK_DEFAULT_PERIOD 250 /* in ms */
> +#define DRM_MSM_HANGCHECK_PROGRESS_RETRIES 3


More information about the dri-devel mailing list