[Intel-gfx] [PATCH v5 17/18] drm/i915: Watchdog timeout: DRM kernel interface to set the timeout

Tue Mar 28 01:03:37 UTC 2017

On 25/03/17 02:38, Chris Wilson wrote:
> On Fri, Mar 24, 2017 at 06:30:09PM -0700, Michel Thierry wrote:
>> Final enablement patch for GPU hang detection using watchdog timeout.
>> Using the gem_context_setparam ioctl, users can specify the desired
>> timeout value in microseconds, and the driver will do the conversion to
>> 'timestamps'.
>>
>> The recommended default watchdog threshold for video engines is 60000 us,
>> since this has been _empirically determined_ to be a good compromise for
>> low-latency requirements and low rate of false positives. The default
>> register value is ~106000us and the theoretical max value (all 1s) is
>> 353 seconds.
>>
>> v2: Fixed get api to return values in microseconds. Threshold updated to
>> be per context engine. Check for u32 overflow. Capture ctx threshold
>> value in error state.
>>
>> Signed-off-by: Tomas Elf <tomas.elf at intel.com>
>> Signed-off-by: Arun Siluvery <arun.siluvery at linux.intel.com>
>> Signed-off-by: Michel Thierry <michel.thierry at intel.com>
>> ---
>>  drivers/gpu/drm/i915/i915_drv.h         |  1 +
>>  drivers/gpu/drm/i915/i915_gem_context.c | 78 +++++++++++++++++++++++++++++++++
>>  drivers/gpu/drm/i915/i915_gem_context.h | 20 +++++++++
>>  drivers/gpu/drm/i915/i915_gpu_error.c   | 11 +++--
>>  drivers/gpu/drm/i915/intel_lrc.c        |  2 +-
>>  include/uapi/drm/i915_drm.h             |  1 +
>>  6 files changed, 108 insertions(+), 5 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
>> index b43c37a911bb..1741584d858f 100644
>> --- a/drivers/gpu/drm/i915/i915_drv.h
>> +++ b/drivers/gpu/drm/i915/i915_drv.h
>> @@ -1039,6 +1039,7 @@ struct i915_gpu_state {
>>  			int ban_score;
>>  			int active;
>>  			int guilty;
>> +			int watchdog_threshold;
>
> Shouldn't this be added in the patch adding it to the context for real?
>

I wanted to wait until I could print it in the error_state using 
watchdog_to_us (which is added until this patch).
I can also move all the i915_gpu_error.c changes to a new patch.

>>  		} context;
>>
>>  		struct drm_i915_error_object {
>> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
>> index edbed85a1c88..f5c126c0c681 100644
>> --- a/drivers/gpu/drm/i915/i915_gem_context.c
>> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
>> @@ -422,6 +422,78 @@ i915_gem_context_create_gvt(struct drm_device *dev)
>>  	return ctx;
>>  }
>>
>> +/* Return the timer count threshold in microseconds. */
>> +int i915_gem_context_get_watchdog(struct i915_gem_context *ctx,
>> +				  struct drm_i915_gem_context_param *args)
>> +{
>> +	struct drm_i915_private *dev_priv = ctx->i915;
>> +	struct intel_engine_cs *engine;
>> +	enum intel_engine_id id;
>> +	u32 threshold_in_us[I915_NUM_ENGINES];
>
> Reading is a 2 step process. First CONTEXT_GET_PARAM passes in size==0
> and gets told by the kernel what size array to allocate. Second pass
> supplies a buffer with that size. (Or userspace can preallocate a large
> enough buffer, declare it's full size and let the kernel fill as much as
> it wants.)
>
> if (args->size == 0)
> 	goto out;
>
> if (args->size < sizeof(threshold_in_us))
> 	return -EFAULT;
EFAULT or EINVAL?

>> +
>> +	if (!dev_priv->engine[VCS]->emit_start_watchdog)
>> +		return -ENODEV;
>> +
>> +	for_each_engine(engine, dev_priv, id) {
>> +		struct intel_context *ce = &ctx->engine[id];
>> +
>> +		threshold_in_us[id] = watchdog_to_us(ce->watchdog_threshold);
>> +	}
>> +
>> +	mutex_unlock(&dev_priv->drm.struct_mutex);
>
> Grr. We should look at why we have this lock here in the first place.
> IIRC, it was there to make TRTT easier, but we can always move the
> burden of work again.
>

It helps TRTT (that code also takes an extra reference of the ctx)... 
but it all started in i915_gem_context_lookup (499f2697da1d)

>> +	if (__copy_to_user(u64_to_user_ptr(args->value),
>> +			   &threshold_in_us,
>> +			   sizeof(threshold_in_us))) {
>> +		mutex_lock(&dev_priv->drm.struct_mutex);
>> +		return -EFAULT;
>> +	}
>> +	mutex_lock(&dev_priv->drm.struct_mutex);
>
> out:
>> +	args->size = sizeof(threshold_in_us);
>> +
>> +	return 0;
>> +}
>> +
>> +/*
>> + * Based on time out value in microseconds (us) calculate
>> + * timer count thresholds needed based on core frequency.
>> + * Watchdog can be disabled by setting it to 0.
>> + */
>> +int i915_gem_context_set_watchdog(struct i915_gem_context *ctx,
>> +				  struct drm_i915_gem_context_param *args)
>> +{
>> +	struct drm_i915_private *dev_priv = ctx->i915;
>> +	struct intel_engine_cs *engine;
>> +	enum intel_engine_id id;
>> +	u32 threshold_in_us[I915_NUM_ENGINES];
>> +
>> +	if (!dev_priv->engine[VCS]->emit_start_watchdog)
>> +		return -ENODEV;
>> +	else if (args->size < sizeof(threshold_in_us))
>> +		return -EINVAL;
>
> args->size == 0 here could be used to mean one value for all? That might
> be a little clumsy compared to the getter, but easy for example for
> disabling the watchdog on all engines.
>
> if (args->size == 0) do_set_all()
> if (args->size < sizeof())
> 	return -EFAULT;
>
>> +
>> +	mutex_unlock(&dev_priv->drm.struct_mutex);
>> +	if (copy_from_user(&threshold_in_us,
>> +			   u64_to_user_ptr(args->value),
>> +			   sizeof(threshold_in_us))) {
>> +		mutex_lock(&dev_priv->drm.struct_mutex);
>> +		return -EFAULT;
>> +	}
>> +	mutex_lock(&dev_priv->drm.struct_mutex);
>> +
>> +	/* not supported in blitter engine */
>> +	if (threshold_in_us[BCS] != 0)
>> +		return -EINVAL;
>> +
>> +	for_each_engine(engine, dev_priv, id) {
>> +		struct intel_context *ce = &ctx->engine[id];
>> +
>> +		ce->watchdog_threshold =
>> +			watchdog_to_clock_counts((u64)threshold_in_us[id]);
>
> Cast is superfluous.
>

without it, the operation in watchdog_to_clock_counts was always casted 
to u32.

>> +	}
>> +
>> +	return 0;
>> +}
>> +
>>  int i915_gem_context_init(struct drm_i915_private *dev_priv)
>>  {
>>  	struct i915_gem_context *ctx;
>> @@ -1061,6 +1133,9 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
>>  	case I915_CONTEXT_PARAM_BANNABLE:
>>  		args->value = i915_gem_context_is_bannable(ctx);
>>  		break;
>> +	case I915_CONTEXT_PARAM_WATCHDOG:
>> +		ret = i915_gem_context_get_watchdog(ctx, args);
>> +		break;
>>  	default:
>>  		ret = -EINVAL;
>>  		break;
>> @@ -1118,6 +1193,9 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
>>  		else
>>  			i915_gem_context_clear_bannable(ctx);
>>  		break;
>> +	case I915_CONTEXT_PARAM_WATCHDOG:
>> +		ret = i915_gem_context_set_watchdog(ctx, args);
>> +		break;
>>  	default:
>>  		ret = -EINVAL;
>>  		break;
>> diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h
>> index 88700bdbb4e1..6867b1fead8b 100644
>> --- a/drivers/gpu/drm/i915/i915_gem_context.h
>> +++ b/drivers/gpu/drm/i915/i915_gem_context.h
>> @@ -250,6 +250,26 @@ static inline bool i915_gem_context_is_kernel(struct i915_gem_context *ctx)
>>  	return !ctx->file_priv;
>>  }
>>
>> +/*
>> + * Timestamp timer resolution = 0.080 uSec,
>> + * or 12500000 counts per second, or ~12 counts per microsecond.
>> + */
>> +#define TIMESTAMP_CNTS_PER_USEC 12
>> +static inline u32 watchdog_to_us(u32 value_in_clock_counts)
>> +{
>> +	return (value_in_clock_counts) / (TIMESTAMP_CNTS_PER_USEC);
>> +}
>
> (brackets) (for fun) (?)
>
>> +
>> +static inline u32 watchdog_to_clock_counts(u64 value_in_us)
>> +{
>> +	u64 threshold = (value_in_us) * (TIMESTAMP_CNTS_PER_USEC);
>> +
>> +	if (GEM_WARN_ON(overflows_type(threshold, u32)))
>
> Nice idea, and yes we should do the range check. But this is a userspace
> interface, so always do it and don't warn, just -EINVAL.

What if it fails/overflows after some engines' thresholds have been set, 
should it set them back to 0's or leave them enable?

All other fixes added.

Thanks