[Intel-gfx] [PATCH v5 17/18] drm/i915: Watchdog timeout: DRM kernel interface to set the timeout

Chris Wilson chris at chris-wilson.co.uk
Sat Mar 25 09:38:34 UTC 2017


On Fri, Mar 24, 2017 at 06:30:09PM -0700, Michel Thierry wrote:
> Final enablement patch for GPU hang detection using watchdog timeout.
> Using the gem_context_setparam ioctl, users can specify the desired
> timeout value in microseconds, and the driver will do the conversion to
> 'timestamps'.
> 
> The recommended default watchdog threshold for video engines is 60000 us,
> since this has been _empirically determined_ to be a good compromise for
> low-latency requirements and low rate of false positives. The default
> register value is ~106000us and the theoretical max value (all 1s) is
> 353 seconds.
> 
> v2: Fixed get api to return values in microseconds. Threshold updated to
> be per context engine. Check for u32 overflow. Capture ctx threshold
> value in error state.
> 
> Signed-off-by: Tomas Elf <tomas.elf at intel.com>
> Signed-off-by: Arun Siluvery <arun.siluvery at linux.intel.com>
> Signed-off-by: Michel Thierry <michel.thierry at intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.h         |  1 +
>  drivers/gpu/drm/i915/i915_gem_context.c | 78 +++++++++++++++++++++++++++++++++
>  drivers/gpu/drm/i915/i915_gem_context.h | 20 +++++++++
>  drivers/gpu/drm/i915/i915_gpu_error.c   | 11 +++--
>  drivers/gpu/drm/i915/intel_lrc.c        |  2 +-
>  include/uapi/drm/i915_drm.h             |  1 +
>  6 files changed, 108 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index b43c37a911bb..1741584d858f 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1039,6 +1039,7 @@ struct i915_gpu_state {
>  			int ban_score;
>  			int active;
>  			int guilty;
> +			int watchdog_threshold;

Shouldn't this be added in the patch adding it to the context for real?

>  		} context;
>  
>  		struct drm_i915_error_object {
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index edbed85a1c88..f5c126c0c681 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -422,6 +422,78 @@ i915_gem_context_create_gvt(struct drm_device *dev)
>  	return ctx;
>  }
>  
> +/* Return the timer count threshold in microseconds. */
> +int i915_gem_context_get_watchdog(struct i915_gem_context *ctx,
> +				  struct drm_i915_gem_context_param *args)
> +{
> +	struct drm_i915_private *dev_priv = ctx->i915;
> +	struct intel_engine_cs *engine;
> +	enum intel_engine_id id;
> +	u32 threshold_in_us[I915_NUM_ENGINES];

Reading is a 2 step process. First CONTEXT_GET_PARAM passes in size==0
and gets told by the kernel what size array to allocate. Second pass
supplies a buffer with that size. (Or userspace can preallocate a large
enough buffer, declare it's full size and let the kernel fill as much as
it wants.)

if (args->size == 0)
	goto out;

if (args->size < sizeof(threshold_in_us))
	return -EFAULT;

> +
> +	if (!dev_priv->engine[VCS]->emit_start_watchdog)
> +		return -ENODEV;
> +
> +	for_each_engine(engine, dev_priv, id) {
> +		struct intel_context *ce = &ctx->engine[id];
> +
> +		threshold_in_us[id] = watchdog_to_us(ce->watchdog_threshold);
> +	}
> +
> +	mutex_unlock(&dev_priv->drm.struct_mutex);

Grr. We should look at why we have this lock here in the first place.
IIRC, it was there to make TRTT easier, but we can always move the
burden of work again.

> +	if (__copy_to_user(u64_to_user_ptr(args->value),
> +			   &threshold_in_us,
> +			   sizeof(threshold_in_us))) {
> +		mutex_lock(&dev_priv->drm.struct_mutex);
> +		return -EFAULT;
> +	}
> +	mutex_lock(&dev_priv->drm.struct_mutex);

out:
> +	args->size = sizeof(threshold_in_us);
> +
> +	return 0;
> +}
> +
> +/*
> + * Based on time out value in microseconds (us) calculate
> + * timer count thresholds needed based on core frequency.
> + * Watchdog can be disabled by setting it to 0.
> + */
> +int i915_gem_context_set_watchdog(struct i915_gem_context *ctx,
> +				  struct drm_i915_gem_context_param *args)
> +{
> +	struct drm_i915_private *dev_priv = ctx->i915;
> +	struct intel_engine_cs *engine;
> +	enum intel_engine_id id;
> +	u32 threshold_in_us[I915_NUM_ENGINES];
> +
> +	if (!dev_priv->engine[VCS]->emit_start_watchdog)
> +		return -ENODEV;
> +	else if (args->size < sizeof(threshold_in_us))
> +		return -EINVAL;

args->size == 0 here could be used to mean one value for all? That might
be a little clumsy compared to the getter, but easy for example for
disabling the watchdog on all engines.

if (args->size == 0) do_set_all()
if (args->size < sizeof())
	return -EFAULT;

> +
> +	mutex_unlock(&dev_priv->drm.struct_mutex);
> +	if (copy_from_user(&threshold_in_us,
> +			   u64_to_user_ptr(args->value),
> +			   sizeof(threshold_in_us))) {
> +		mutex_lock(&dev_priv->drm.struct_mutex);
> +		return -EFAULT;
> +	}
> +	mutex_lock(&dev_priv->drm.struct_mutex);
> +
> +	/* not supported in blitter engine */
> +	if (threshold_in_us[BCS] != 0)
> +		return -EINVAL;
> +
> +	for_each_engine(engine, dev_priv, id) {
> +		struct intel_context *ce = &ctx->engine[id];
> +
> +		ce->watchdog_threshold =
> +			watchdog_to_clock_counts((u64)threshold_in_us[id]);

Cast is superfluous.

> +	}
> +
> +	return 0;
> +}
> +
>  int i915_gem_context_init(struct drm_i915_private *dev_priv)
>  {
>  	struct i915_gem_context *ctx;
> @@ -1061,6 +1133,9 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
>  	case I915_CONTEXT_PARAM_BANNABLE:
>  		args->value = i915_gem_context_is_bannable(ctx);
>  		break;
> +	case I915_CONTEXT_PARAM_WATCHDOG:
> +		ret = i915_gem_context_get_watchdog(ctx, args);
> +		break;
>  	default:
>  		ret = -EINVAL;
>  		break;
> @@ -1118,6 +1193,9 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
>  		else
>  			i915_gem_context_clear_bannable(ctx);
>  		break;
> +	case I915_CONTEXT_PARAM_WATCHDOG:
> +		ret = i915_gem_context_set_watchdog(ctx, args);
> +		break;
>  	default:
>  		ret = -EINVAL;
>  		break;
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h
> index 88700bdbb4e1..6867b1fead8b 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.h
> +++ b/drivers/gpu/drm/i915/i915_gem_context.h
> @@ -250,6 +250,26 @@ static inline bool i915_gem_context_is_kernel(struct i915_gem_context *ctx)
>  	return !ctx->file_priv;
>  }
>  
> +/*
> + * Timestamp timer resolution = 0.080 uSec,
> + * or 12500000 counts per second, or ~12 counts per microsecond.
> + */
> +#define TIMESTAMP_CNTS_PER_USEC 12
> +static inline u32 watchdog_to_us(u32 value_in_clock_counts)
> +{
> +	return (value_in_clock_counts) / (TIMESTAMP_CNTS_PER_USEC);
> +}

(brackets) (for fun) (?)

> +
> +static inline u32 watchdog_to_clock_counts(u64 value_in_us)
> +{
> +	u64 threshold = (value_in_us) * (TIMESTAMP_CNTS_PER_USEC);
> +
> +	if (GEM_WARN_ON(overflows_type(threshold, u32)))

Nice idea, and yes we should do the range check. But this is a userspace
interface, so always do it and don't warn, just -EINVAL.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre


More information about the Intel-gfx mailing list