[Intel-gfx] [PATCH v5 17/18] drm/i915: Watchdog timeout: DRM kernel interface to set the timeout
Michel Thierry
michel.thierry at intel.com
Tue Mar 28 01:03:37 UTC 2017
On 25/03/17 02:38, Chris Wilson wrote:
> On Fri, Mar 24, 2017 at 06:30:09PM -0700, Michel Thierry wrote:
>> Final enablement patch for GPU hang detection using watchdog timeout.
>> Using the gem_context_setparam ioctl, users can specify the desired
>> timeout value in microseconds, and the driver will do the conversion to
>> 'timestamps'.
>>
>> The recommended default watchdog threshold for video engines is 60000 us,
>> since this has been _empirically determined_ to be a good compromise for
>> low-latency requirements and low rate of false positives. The default
>> register value is ~106000us and the theoretical max value (all 1s) is
>> 353 seconds.
>>
>> v2: Fixed get api to return values in microseconds. Threshold updated to
>> be per context engine. Check for u32 overflow. Capture ctx threshold
>> value in error state.
>>
>> Signed-off-by: Tomas Elf <tomas.elf at intel.com>
>> Signed-off-by: Arun Siluvery <arun.siluvery at linux.intel.com>
>> Signed-off-by: Michel Thierry <michel.thierry at intel.com>
>> ---
>> drivers/gpu/drm/i915/i915_drv.h | 1 +
>> drivers/gpu/drm/i915/i915_gem_context.c | 78 +++++++++++++++++++++++++++++++++
>> drivers/gpu/drm/i915/i915_gem_context.h | 20 +++++++++
>> drivers/gpu/drm/i915/i915_gpu_error.c | 11 +++--
>> drivers/gpu/drm/i915/intel_lrc.c | 2 +-
>> include/uapi/drm/i915_drm.h | 1 +
>> 6 files changed, 108 insertions(+), 5 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
>> index b43c37a911bb..1741584d858f 100644
>> --- a/drivers/gpu/drm/i915/i915_drv.h
>> +++ b/drivers/gpu/drm/i915/i915_drv.h
>> @@ -1039,6 +1039,7 @@ struct i915_gpu_state {
>> int ban_score;
>> int active;
>> int guilty;
>> + int watchdog_threshold;
>
> Shouldn't this be added in the patch adding it to the context for real?
>
I wanted to wait until I could print it in the error_state using
watchdog_to_us (which is added until this patch).
I can also move all the i915_gpu_error.c changes to a new patch.
>> } context;
>>
>> struct drm_i915_error_object {
>> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
>> index edbed85a1c88..f5c126c0c681 100644
>> --- a/drivers/gpu/drm/i915/i915_gem_context.c
>> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
>> @@ -422,6 +422,78 @@ i915_gem_context_create_gvt(struct drm_device *dev)
>> return ctx;
>> }
>>
>> +/* Return the timer count threshold in microseconds. */
>> +int i915_gem_context_get_watchdog(struct i915_gem_context *ctx,
>> + struct drm_i915_gem_context_param *args)
>> +{
>> + struct drm_i915_private *dev_priv = ctx->i915;
>> + struct intel_engine_cs *engine;
>> + enum intel_engine_id id;
>> + u32 threshold_in_us[I915_NUM_ENGINES];
>
> Reading is a 2 step process. First CONTEXT_GET_PARAM passes in size==0
> and gets told by the kernel what size array to allocate. Second pass
> supplies a buffer with that size. (Or userspace can preallocate a large
> enough buffer, declare it's full size and let the kernel fill as much as
> it wants.)
>
> if (args->size == 0)
> goto out;
>
> if (args->size < sizeof(threshold_in_us))
> return -EFAULT;
EFAULT or EINVAL?
>> +
>> + if (!dev_priv->engine[VCS]->emit_start_watchdog)
>> + return -ENODEV;
>> +
>> + for_each_engine(engine, dev_priv, id) {
>> + struct intel_context *ce = &ctx->engine[id];
>> +
>> + threshold_in_us[id] = watchdog_to_us(ce->watchdog_threshold);
>> + }
>> +
>> + mutex_unlock(&dev_priv->drm.struct_mutex);
>
> Grr. We should look at why we have this lock here in the first place.
> IIRC, it was there to make TRTT easier, but we can always move the
> burden of work again.
>
It helps TRTT (that code also takes an extra reference of the ctx)...
but it all started in i915_gem_context_lookup (499f2697da1d)
>> + if (__copy_to_user(u64_to_user_ptr(args->value),
>> + &threshold_in_us,
>> + sizeof(threshold_in_us))) {
>> + mutex_lock(&dev_priv->drm.struct_mutex);
>> + return -EFAULT;
>> + }
>> + mutex_lock(&dev_priv->drm.struct_mutex);
>
> out:
>> + args->size = sizeof(threshold_in_us);
>> +
>> + return 0;
>> +}
>> +
>> +/*
>> + * Based on time out value in microseconds (us) calculate
>> + * timer count thresholds needed based on core frequency.
>> + * Watchdog can be disabled by setting it to 0.
>> + */
>> +int i915_gem_context_set_watchdog(struct i915_gem_context *ctx,
>> + struct drm_i915_gem_context_param *args)
>> +{
>> + struct drm_i915_private *dev_priv = ctx->i915;
>> + struct intel_engine_cs *engine;
>> + enum intel_engine_id id;
>> + u32 threshold_in_us[I915_NUM_ENGINES];
>> +
>> + if (!dev_priv->engine[VCS]->emit_start_watchdog)
>> + return -ENODEV;
>> + else if (args->size < sizeof(threshold_in_us))
>> + return -EINVAL;
>
> args->size == 0 here could be used to mean one value for all? That might
> be a little clumsy compared to the getter, but easy for example for
> disabling the watchdog on all engines.
>
> if (args->size == 0) do_set_all()
> if (args->size < sizeof())
> return -EFAULT;
>
>> +
>> + mutex_unlock(&dev_priv->drm.struct_mutex);
>> + if (copy_from_user(&threshold_in_us,
>> + u64_to_user_ptr(args->value),
>> + sizeof(threshold_in_us))) {
>> + mutex_lock(&dev_priv->drm.struct_mutex);
>> + return -EFAULT;
>> + }
>> + mutex_lock(&dev_priv->drm.struct_mutex);
>> +
>> + /* not supported in blitter engine */
>> + if (threshold_in_us[BCS] != 0)
>> + return -EINVAL;
>> +
>> + for_each_engine(engine, dev_priv, id) {
>> + struct intel_context *ce = &ctx->engine[id];
>> +
>> + ce->watchdog_threshold =
>> + watchdog_to_clock_counts((u64)threshold_in_us[id]);
>
> Cast is superfluous.
>
without it, the operation in watchdog_to_clock_counts was always casted
to u32.
>> + }
>> +
>> + return 0;
>> +}
>> +
>> int i915_gem_context_init(struct drm_i915_private *dev_priv)
>> {
>> struct i915_gem_context *ctx;
>> @@ -1061,6 +1133,9 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
>> case I915_CONTEXT_PARAM_BANNABLE:
>> args->value = i915_gem_context_is_bannable(ctx);
>> break;
>> + case I915_CONTEXT_PARAM_WATCHDOG:
>> + ret = i915_gem_context_get_watchdog(ctx, args);
>> + break;
>> default:
>> ret = -EINVAL;
>> break;
>> @@ -1118,6 +1193,9 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
>> else
>> i915_gem_context_clear_bannable(ctx);
>> break;
>> + case I915_CONTEXT_PARAM_WATCHDOG:
>> + ret = i915_gem_context_set_watchdog(ctx, args);
>> + break;
>> default:
>> ret = -EINVAL;
>> break;
>> diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h
>> index 88700bdbb4e1..6867b1fead8b 100644
>> --- a/drivers/gpu/drm/i915/i915_gem_context.h
>> +++ b/drivers/gpu/drm/i915/i915_gem_context.h
>> @@ -250,6 +250,26 @@ static inline bool i915_gem_context_is_kernel(struct i915_gem_context *ctx)
>> return !ctx->file_priv;
>> }
>>
>> +/*
>> + * Timestamp timer resolution = 0.080 uSec,
>> + * or 12500000 counts per second, or ~12 counts per microsecond.
>> + */
>> +#define TIMESTAMP_CNTS_PER_USEC 12
>> +static inline u32 watchdog_to_us(u32 value_in_clock_counts)
>> +{
>> + return (value_in_clock_counts) / (TIMESTAMP_CNTS_PER_USEC);
>> +}
>
> (brackets) (for fun) (?)
>
>> +
>> +static inline u32 watchdog_to_clock_counts(u64 value_in_us)
>> +{
>> + u64 threshold = (value_in_us) * (TIMESTAMP_CNTS_PER_USEC);
>> +
>> + if (GEM_WARN_ON(overflows_type(threshold, u32)))
>
> Nice idea, and yes we should do the range check. But this is a userspace
> interface, so always do it and don't warn, just -EINVAL.
What if it fails/overflows after some engines' thresholds have been set,
should it set them back to 0's or leave them enable?
All other fixes added.
Thanks
More information about the Intel-gfx
mailing list