[Intel-gfx] [RFC 3/3] drm/i915: Watchdog timeout: DRM kernel interface to set the timeout
Chris Wilson
chris at chris-wilson.co.uk
Thu Feb 23 21:14:54 UTC 2017
On Thu, Feb 23, 2017 at 11:44:19AM -0800, Michel Thierry wrote:
> Final enablement patch for GPU hang detection using watchdog timeout.
> Using the gem_context_setparam ioctl, users can specify the desired
> timeout value in milliseconds, and the driver will do the conversion to
> 'timestamps'.
>
> The _recommended_ default watchdog threshold for video engines is 60 ms,
> since this has been _empirically determined_ to be a good compromise for
> low-latency requirements and low rate of false positives. The default
> register value is ~106ms and the theoretical max value (all 1s) is
> 353 seconds.
>
> Signed-off-by: Tomas Elf <tomas.elf at intel.com>
> Signed-off-by: Arun Siluvery <arun.siluvery at linux.intel.com>
> Signed-off-by: Michel Thierry <michel.thierry at intel.com>
> ---
> drivers/gpu/drm/i915/i915_gem_context.c | 46 ++++++++++++++++++++++++++++++
> drivers/gpu/drm/i915/i915_gem_context.h | 3 ++
> drivers/gpu/drm/i915/i915_gem_execbuffer.c | 8 +-----
> include/uapi/drm/i915_drm.h | 1 +
> 4 files changed, 51 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index 99c46f4dbde6..c3748878e64c 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -440,6 +440,32 @@ i915_gem_context_create_gvt(struct drm_device *dev)
> return ctx;
> }
>
> +void i915_context_watchdog_setup(struct i915_gem_context *ctx, u32 value_in_ms)
Ahem.
> +{
> + /*
> + * Based on time out value (ms) calculate
> + * timer count thresholds needed based on core frequency.
> + */
> +#define TIMER_MILLISECOND 1000
> +
> + /*
> + * Timestamp timer resolution = 0.080 uSec,
> + * or 12500000 counts per second
> + */
> +#define TIMESTAMP_CNTS_PER_SEC_80NS 12500000
> +
> + ctx->watchdog_threshold =
> + ((value_in_ms) *
> + ((TIMESTAMP_CNTS_PER_SEC_80NS) / (TIMER_MILLISECOND)));
Rescale to ns (u64 math), check for overflows before assigning to u32.
> + /*
> + * watchdog register must never be programmed to zero. This would
> + * cause the watchdog counter to exceed and not allow the engine to
> + * go into IDLE state
> + */
> + GEM_BUG_ON(ctx->watchdog_threshold == 0);
This throws a bug when the user tries to disable the watchdog
afterwards. [ctx->watchdog_threshold = 0 => disable watchdog around this
context, default].
> +}
> +
> int i915_gem_context_init(struct drm_i915_private *dev_priv)
> {
> struct i915_gem_context *ctx;
> @@ -1056,6 +1082,7 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
> struct drm_i915_file_private *file_priv = file->driver_priv;
> struct drm_i915_gem_context_param *args = data;
> struct i915_gem_context *ctx;
> + struct intel_engine_cs *engine;
> int ret;
>
> ret = i915_mutex_lock_interruptible(dev);
> @@ -1090,6 +1117,15 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
> case I915_CONTEXT_PARAM_BANNABLE:
> args->value = i915_gem_context_is_bannable(ctx);
> break;
> + case I915_CONTEXT_PARAM_WATCHDOG:
> + engine = to_i915(dev)->engine[VCS];
> + if (!engine->emit_start_watchdog)
> + ret = -EINVAL;
> + else if (args->value)
> + ret = -EINVAL;
> + else
> + args->value = ctx->watchdog_threshold;
Just
args->value = watchdog_to_ns(ctx->watchdog_threshold);
will be 0 where unset. On setting we complain if not supported.
Do we really want the user API to be in clock ticks rather than say ns?
I think we want ns, and don't forget to include that information in the
error state. Do we want to capture error state? Do we want this to
contribute to banning?
> + break;
> default:
> ret = -EINVAL;
> break;
> @@ -1105,6 +1141,7 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
> struct drm_i915_file_private *file_priv = file->driver_priv;
> struct drm_i915_gem_context_param *args = data;
> struct i915_gem_context *ctx;
> + struct intel_engine_cs *engine;
> int ret;
>
> ret = i915_mutex_lock_interruptible(dev);
> @@ -1147,6 +1184,15 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
> else
> i915_gem_context_clear_bannable(ctx);
> break;
> + case I915_CONTEXT_PARAM_WATCHDOG:
> + engine = to_i915(dev)->engine[VCS];
> + if (!engine->emit_start_watchdog)
> + ret = -EINVAL;
ret = -ENODEV;
> + else if (!args->value)
> + ret = -EINVAL;
> + else
> + i915_context_watchdog_setup(ctx, args->value);
I'm pushing for ns. And this should be
i915_gem_context_set_watchdog(ctx, args->value);
> + break;
> default:
> ret = -EINVAL;
> break;
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h
> index 0ac750b90f3d..133ed7b413aa 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.h
> +++ b/drivers/gpu/drm/i915/i915_gem_context.h
> @@ -176,6 +176,9 @@ struct i915_gem_context {
> /** ban_score: Accumulated score of all hangs caused by this context. */
> int ban_score;
>
> + /** watchdog_threshold: hw watchdog threshold value, in clock counts */
> + u32 watchdog_threshold;
> +
> /** remap_slice: Bitmask of cache lines that need remapping */
> u8 remap_slice;
> };
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index 348d81c40e81..26c50a6d6158 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -1419,12 +1419,6 @@ execbuf_submit(struct i915_execbuffer_params *params,
> bool watchdog_running = false;
> int ret;
>
> - /*
> - * NB: Place-holder until watchdog timeout is enabled through DRM
> - * interface
> - */
> - bool enable_watchdog = false;
> -
> ret = i915_gem_execbuffer_move_to_gpu(params->request, vmas);
> if (ret)
> return ret;
> @@ -1488,7 +1482,7 @@ execbuf_submit(struct i915_execbuffer_params *params,
> }
>
> /* Start watchdog timer */
> - if (enable_watchdog) {
> + if (params->ctx->watchdog_threshold != 0) {
> if (!params->engine->emit_start_watchdog)
> return -EINVAL;
No. If we set a watchdog on a ctx, this then causes all use of BCS to
fail. We need saner API than that - either per-engine thresholds in the
ctx, or just document that the watchdog is only supported where
available by hw.
-Chris
--
Chris Wilson, Intel Open Source Technology Centre
More information about the Intel-gfx
mailing list