[Intel-gfx] [PATCH v5 17/18] drm/i915: Watchdog timeout: DRM kernel interface to set the timeout

Daniele Ceraolo Spurio daniele.ceraolospurio at intel.com
Fri Apr 14 16:05:06 UTC 2017



On 24/03/17 18:30, Michel Thierry wrote:
> Final enablement patch for GPU hang detection using watchdog timeout.
> Using the gem_context_setparam ioctl, users can specify the desired
> timeout value in microseconds, and the driver will do the conversion to
> 'timestamps'.
>
> The recommended default watchdog threshold for video engines is 60000 us,
> since this has been _empirically determined_ to be a good compromise for
> low-latency requirements and low rate of false positives. The default
> register value is ~106000us and the theoretical max value (all 1s) is
> 353 seconds.
>
> v2: Fixed get api to return values in microseconds. Threshold updated to
> be per context engine. Check for u32 overflow. Capture ctx threshold
> value in error state.
>
> Signed-off-by: Tomas Elf <tomas.elf at intel.com>
> Signed-off-by: Arun Siluvery <arun.siluvery at linux.intel.com>
> Signed-off-by: Michel Thierry <michel.thierry at intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.h         |  1 +
>  drivers/gpu/drm/i915/i915_gem_context.c | 78 +++++++++++++++++++++++++++++++++
>  drivers/gpu/drm/i915/i915_gem_context.h | 20 +++++++++
>  drivers/gpu/drm/i915/i915_gpu_error.c   | 11 +++--
>  drivers/gpu/drm/i915/intel_lrc.c        |  2 +-
>  include/uapi/drm/i915_drm.h             |  1 +
>  6 files changed, 108 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index b43c37a911bb..1741584d858f 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1039,6 +1039,7 @@ struct i915_gpu_state {
>  			int ban_score;
>  			int active;
>  			int guilty;
> +			int watchdog_threshold;
>  		} context;
>
>  		struct drm_i915_error_object {
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index edbed85a1c88..f5c126c0c681 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -422,6 +422,78 @@ i915_gem_context_create_gvt(struct drm_device *dev)
>  	return ctx;
>  }
>
> +/* Return the timer count threshold in microseconds. */
> +int i915_gem_context_get_watchdog(struct i915_gem_context *ctx,
> +				  struct drm_i915_gem_context_param *args)
> +{
> +	struct drm_i915_private *dev_priv = ctx->i915;
> +	struct intel_engine_cs *engine;
> +	enum intel_engine_id id;
> +	u32 threshold_in_us[I915_NUM_ENGINES];
> +
> +	if (!dev_priv->engine[VCS]->emit_start_watchdog)
> +		return -ENODEV;
> +
> +	for_each_engine(engine, dev_priv, id) {
> +		struct intel_context *ce = &ctx->engine[id];
> +
> +		threshold_in_us[id] = watchdog_to_us(ce->watchdog_threshold);
> +	}
> +
> +	mutex_unlock(&dev_priv->drm.struct_mutex);
> +	if (__copy_to_user(u64_to_user_ptr(args->value),
> +			   &threshold_in_us,
> +			   sizeof(threshold_in_us))) {
> +		mutex_lock(&dev_priv->drm.struct_mutex);
> +		return -EFAULT;
> +	}
> +	mutex_lock(&dev_priv->drm.struct_mutex);
> +	args->size = sizeof(threshold_in_us);
> +
> +	return 0;
> +}
> +
> +/*
> + * Based on time out value in microseconds (us) calculate
> + * timer count thresholds needed based on core frequency.
> + * Watchdog can be disabled by setting it to 0.
> + */
> +int i915_gem_context_set_watchdog(struct i915_gem_context *ctx,
> +				  struct drm_i915_gem_context_param *args)
> +{
> +	struct drm_i915_private *dev_priv = ctx->i915;
> +	struct intel_engine_cs *engine;
> +	enum intel_engine_id id;
> +	u32 threshold_in_us[I915_NUM_ENGINES];
> +
> +	if (!dev_priv->engine[VCS]->emit_start_watchdog)
> +		return -ENODEV;
> +	else if (args->size < sizeof(threshold_in_us))
> +		return -EINVAL;

won't we break userspace with this check if we ever get more engines on 
a new platform and thus bump I915_NUM_ENGINES?

Thanks,
Daniele

> +
> +	mutex_unlock(&dev_priv->drm.struct_mutex);
> +	if (copy_from_user(&threshold_in_us,
> +			   u64_to_user_ptr(args->value),
> +			   sizeof(threshold_in_us))) {
> +		mutex_lock(&dev_priv->drm.struct_mutex);
> +		return -EFAULT;
> +	}
> +	mutex_lock(&dev_priv->drm.struct_mutex);
> +
> +	/* not supported in blitter engine */
> +	if (threshold_in_us[BCS] != 0)
> +		return -EINVAL;
> +
> +	for_each_engine(engine, dev_priv, id) {
> +		struct intel_context *ce = &ctx->engine[id];
> +
> +		ce->watchdog_threshold =
> +			watchdog_to_clock_counts((u64)threshold_in_us[id]);
> +	}
> +
> +	return 0;
> +}
> +
>  int i915_gem_context_init(struct drm_i915_private *dev_priv)
>  {
>  	struct i915_gem_context *ctx;
> @@ -1061,6 +1133,9 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
>  	case I915_CONTEXT_PARAM_BANNABLE:
>  		args->value = i915_gem_context_is_bannable(ctx);
>  		break;
> +	case I915_CONTEXT_PARAM_WATCHDOG:
> +		ret = i915_gem_context_get_watchdog(ctx, args);
> +		break;
>  	default:
>  		ret = -EINVAL;
>  		break;
> @@ -1118,6 +1193,9 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
>  		else
>  			i915_gem_context_clear_bannable(ctx);
>  		break;
> +	case I915_CONTEXT_PARAM_WATCHDOG:
> +		ret = i915_gem_context_set_watchdog(ctx, args);
> +		break;
>  	default:
>  		ret = -EINVAL;
>  		break;
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h
> index 88700bdbb4e1..6867b1fead8b 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.h
> +++ b/drivers/gpu/drm/i915/i915_gem_context.h
> @@ -250,6 +250,26 @@ static inline bool i915_gem_context_is_kernel(struct i915_gem_context *ctx)
>  	return !ctx->file_priv;
>  }
>
> +/*
> + * Timestamp timer resolution = 0.080 uSec,
> + * or 12500000 counts per second, or ~12 counts per microsecond.
> + */
> +#define TIMESTAMP_CNTS_PER_USEC 12
> +static inline u32 watchdog_to_us(u32 value_in_clock_counts)
> +{
> +	return (value_in_clock_counts) / (TIMESTAMP_CNTS_PER_USEC);
> +}
> +
> +static inline u32 watchdog_to_clock_counts(u64 value_in_us)
> +{
> +	u64 threshold = (value_in_us) * (TIMESTAMP_CNTS_PER_USEC);
> +
> +	if (GEM_WARN_ON(overflows_type(threshold, u32)))
> +		return 0;
> +
> +	return threshold;
> +}
> +
>  /* i915_gem_context.c */
>  int __must_check i915_gem_context_init(struct drm_i915_private *dev_priv);
>  void i915_gem_context_lost(struct drm_i915_private *dev_priv);
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index 5d015bcc7484..8acb83778030 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -388,9 +388,10 @@ static void error_print_context(struct drm_i915_error_state_buf *m,
>  				const char *header,
>  				const struct drm_i915_error_context *ctx)
>  {
> -	err_printf(m, "%s%s[%d] user_handle %d hw_id %d, ban score %d guilty %d active %d\n",
> +	err_printf(m, "%s%s[%d] user_handle %d hw_id %d, ban score %d guilty %d active %d, watchdog %dus\n",
>  		   header, ctx->comm, ctx->pid, ctx->handle, ctx->hw_id,
> -		   ctx->ban_score, ctx->guilty, ctx->active);
> +		   ctx->ban_score, ctx->guilty, ctx->active,
> +		   watchdog_to_us(ctx->watchdog_threshold));
>  }
>
>  static void error_print_engine(struct drm_i915_error_state_buf *m,
> @@ -1336,7 +1337,8 @@ static void error_record_engine_execlists(struct intel_engine_cs *engine,
>  }
>
>  static void record_context(struct drm_i915_error_context *e,
> -			   struct i915_gem_context *ctx)
> +			   struct i915_gem_context *ctx,
> +			   u32 engine_id)
>  {
>  	if (ctx->pid) {
>  		struct task_struct *task;
> @@ -1355,6 +1357,7 @@ static void record_context(struct drm_i915_error_context *e,
>  	e->ban_score = ctx->ban_score;
>  	e->guilty = ctx->guilty_count;
>  	e->active = ctx->active_count;
> +	e->watchdog_threshold =	ctx->engine[engine_id].watchdog_threshold;
>  }
>
>  static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
> @@ -1389,7 +1392,7 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
>  			ee->vm = request->ctx->ppgtt ?
>  				&request->ctx->ppgtt->base : &ggtt->base;
>
> -			record_context(&ee->context, request->ctx);
> +			record_context(&ee->context, request->ctx, engine->id);
>
>  			/* We need to copy these to an anonymous buffer
>  			 * as the simplest method to avoid being overwritten
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 2736f642dc76..3f2b57a22338 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -1496,7 +1496,7 @@ static int gen8_emit_flush_render(struct drm_i915_gem_request *request,
>  	return 0;
>  }
>
> -#define GEN8_WATCHDOG_1000US 0x2ee0 //XXX: Temp, replace with helper function
> +#define GEN8_WATCHDOG_1000US watchdog_to_clock_counts(1000)
>  static void gen8_watchdog_irq_handler(unsigned long data)
>  {
>  	struct intel_engine_cs *engine = (struct intel_engine_cs *)data;
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index f083931a7809..448c9c0faa69 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -1293,6 +1293,7 @@ struct drm_i915_gem_context_param {
>  #define I915_CONTEXT_PARAM_GTT_SIZE	0x3
>  #define I915_CONTEXT_PARAM_NO_ERROR_CAPTURE	0x4
>  #define I915_CONTEXT_PARAM_BANNABLE	0x5
> +#define I915_CONTEXT_PARAM_WATCHDOG	0x6
>  	__u64 value;
>  };
>
>


More information about the Intel-gfx mailing list