[Intel-gfx] [PATCH v4 4/5] drm/i915: Watchdog timeout: DRM kernel interface to set the timeout

Thu Feb 28 17:22:09 UTC 2019

On 21/02/2019 02:58, Carlos Santa wrote:
> From: Michel Thierry <michel.thierry at intel.com>
> 
> Final enablement patch for GPU hang detection using watchdog timeout.
> Using the gem_context_setparam ioctl, users can specify the desired
> timeout value in microseconds, and the driver will do the conversion to
> 'timestamps'.
> 
> The recommended default watchdog threshold for video engines is 60000 us,
> since this has been _empirically determined_ to be a good compromise for
> low-latency requirements and low rate of false positives. The default
> register value is ~106000us and the theoretical max value (all 1s) is
> 353 seconds.
> 
> [1] http://patchwork.freedesktop.org/patch/msgid/20170329135831.30254-2-chris@chris-wilson.co.uk
> 
> v2: Fixed get api to return values in microseconds. Threshold updated to
> be per context engine. Check for u32 overflow. Capture ctx threshold
> value in error state.
> 
> v3: Add a way to get array size, short-cut to disable all thresholds,
> return EFAULT / EINVAL as needed. Move the capture of the threshold
> value in the error state into a new patch. BXT has a different
> timestamp base (because why not?).
> 
> v4: Checking if watchdog is available should be the first thing to
> do, instead of giving false hopes to abi users; remove unnecessary & in
> set_watchdog; ignore args->size in getparam.
> 
> v5: GEN9-LP platforms have a different crystal clock frequency, use the
> right timestamp base for them (magic 8-ball predicts this will change
> again later on, so future-proof it). (Daniele)
> 
> v6: Rebase, no more mutex BLK in getparam_ioctl.
> 
> v7: use to_intel_context instead of ctx->engine.
> 
> v8: Rebase, remove extra mutex from i915_gem_context_set_watchdog (Tvrtko),
> Update UAPI to use engine class while keeping thresholds per
> engine class (Michel).
> 
> v9: Rebase,
>      Remove outdated comment from the commit message (Tvrtko)
>      Use the engine->flag to verify for gpu watchdog support (Tvrtko)
>      Use the standard copy_to_user() instead (Tvrtko)
>      Use the correct type when declaring engine class iterator (Tvrtko)
>      Remove yet another unncessary mutex_lock (Tvrtko)
> 
> Cc: Antonio Argenziano <antonio.argenziano at intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at linux.intel.com>
> Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>
> Signed-off-by: Michel Thierry <michel.thierry at intel.com>
> Signed-off-by: Carlos Santa <carlos.santa at intel.com>
> ---
>   drivers/gpu/drm/i915/i915_drv.h         | 50 +++++++++++++-
>   drivers/gpu/drm/i915/i915_gem_context.c | 91 +++++++++++++++++++++++++
>   include/uapi/drm/i915_drm.h             |  1 +
>   3 files changed, 141 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 0fcb2df869a2..aaa5810ba76c 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1582,6 +1582,9 @@ struct drm_i915_private {
>   	struct drm_i915_fence_reg fence_regs[I915_MAX_NUM_FENCES]; /* assume 965 */
>   	int num_fence_regs; /* 8 on pre-965, 16 otherwise */
>   
> +	/* Command stream timestamp base - helps define watchdog threshold */
> +	u32 cs_timestamp_base;
> +
>   	unsigned int fsb_freq, mem_freq, is_ddr3;
>   	unsigned int skl_preferred_vco_freq;
>   	unsigned int max_cdclk_freq;
> @@ -3120,10 +3123,55 @@ i915_gem_context_lookup(struct drm_i915_file_private *file_priv, u32 id)
>   	return ctx;
>   }
>   
> +/*
> + * BDW, CHV & SKL+ Timestamp timer resolution = 0.080 uSec,
> + * or 12500000 counts per second, or ~12 counts per microsecond.
> + *
> + * But BXT/GLK Timestamp timer resolution is different, 0.052 uSec,
> + * or 19200000 counts per second, or ~19 counts per microsecond.
> + *
> + * Future-proofing, some day it won't be as simple as just GEN & IS_LP.
> + */
> +#define GEN8_TIMESTAMP_CNTS_PER_USEC 12
> +#define GEN9_LP_TIMESTAMP_CNTS_PER_USEC 19
> +static inline u32 cs_timestamp_in_us(struct drm_i915_private *dev_priv)

Probably let the compiler decide on the inline.

And s/dev_priv/i915/ is preferred in the GEM areas unless there are 
pesky I915_READ/WRITE around.

> +{
> +	u32 cs_timestamp_base = dev_priv->cs_timestamp_base;
> +
> +	if (cs_timestamp_base)
> +		return cs_timestamp_base;
> +
> +	switch (INTEL_GEN(dev_priv)) {
> +	default:
> +		MISSING_CASE(INTEL_GEN(dev_priv));
> +		/* fall through */
> +	case 9:
> +		cs_timestamp_base = IS_GEN9_LP(dev_priv) ?
> +					GEN9_LP_TIMESTAMP_CNTS_PER_USEC :
> +					GEN8_TIMESTAMP_CNTS_PER_USEC;
> +		break;
> +	case 8:
> +		cs_timestamp_base = GEN8_TIMESTAMP_CNTS_PER_USEC;
> +		break;
> +	}
> +
> +	dev_priv->cs_timestamp_base = cs_timestamp_base;
> +	return cs_timestamp_base;
> +}
> +
> +static inline u32
> +watchdog_to_us(struct drm_i915_private *dev_priv, u32 value_in_clock_counts)

Drop the inline as well.

> +{
> +	return value_in_clock_counts / cs_timestamp_in_us(dev_priv);
> +}
> +
>   static inline u32
>   watchdog_to_clock_counts(struct drm_i915_private *dev_priv, u64 value_in_us)

Here as well.

u32 for value_in_us should be enough according to the caller.

>   {
> -	u64 threshold = 0;
> +	u64 threshold = value_in_us * cs_timestamp_in_us(dev_priv);
> +
> +	if (overflows_type(threshold, u32))
> +		return -EINVAL;

You could use an u64 local for checking the overflow. And it is also a 
bit unusual to return -EINVAL in u32.

Maybe return an int success/error, or a bool, and value via pointer 
argument? Or return zero on overflow and in the caller check for it if 
passed in value was non-zero?

>   
>   	return threshold;
>   }
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index cbfe8f2eb3f2..e1abca28140b 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -1573,6 +1573,89 @@ get_engines(struct i915_gem_context *ctx,
>   	return err;
>   }
>   
> +/* Return the timer count threshold in microseconds. */

Not really true even if it is talking about what it copies back to 
userspace.

> +int i915_gem_context_get_watchdog(struct i915_gem_context *ctx,
> +				  struct drm_i915_gem_context_param *args)
> +{
> +	struct drm_i915_private *dev_priv = ctx->i915;
> +	struct intel_engine_cs *engine;
> +	enum intel_engine_id id;
> +	u32 threshold_in_us[OTHER_CLASS];
> +
> +	if(!intel_engine_supports_watchdog(dev_priv->engine[VCS]))
> +		return -ENODEV;
> +
> +	for_each_engine(engine, dev_priv, id) {
> +		struct intel_context *ce = to_intel_context(ctx, engine);
> +
> +		threshold_in_us[engine->class] = watchdog_to_us(dev_priv,
> +								ce->watchdog_threshold);
> +	}
> +
> +	if (copy_to_user(u64_to_user_ptr(args->value),
> +			   &threshold_in_us,
> +			   sizeof(threshold_in_us))) {
> +		return -EFAULT;
> +	}
> +
> +	args->size = sizeof(threshold_in_us);
> +
> +	return 0;
> +}
> +
> +/*
> + * Based on time out value in microseconds (us) calculate
> + * timer count thresholds needed based on core frequency.
> + * Watchdog can be disabled by setting it to 0.
> + */
> +int i915_gem_context_set_watchdog(struct i915_gem_context *ctx,
> +				  struct drm_i915_gem_context_param *args)
> +{
> +	struct drm_i915_private *dev_priv = ctx->i915;
> +	struct intel_engine_cs *engine;
> +	enum intel_engine_id id;
> +	int i;
> +	u32 threshold[OTHER_CLASS];
> +
> +	if(!intel_engine_supports_watchdog(dev_priv->engine[VCS]))
> +		return -ENODEV;

You could check for each engine if the suggested uAPI was considered.

> +
> +	memset(threshold, 0, sizeof(threshold));
> +
> +	/* shortcut to disable in all engines */
> +	if (args->size == 0)
> +		goto set_watchdog;
> +
> +	if (args->size < sizeof(threshold))
> +		return -EFAULT; > +
> +	if (copy_from_user(threshold,
> +			   u64_to_user_ptr(args->value),
> +			   sizeof(threshold))) {
> +		return -EFAULT;
> +	}
> +
> +	/* not supported in blitter engine */
> +	if (threshold[COPY_ENGINE_CLASS] != 0)
> +		return -EINVAL;

You added engine_supports_watchdog.

> +
> +	for (i = RENDER_CLASS; i < OTHER_CLASS; i++) {
> +		threshold[i] = watchdog_to_clock_counts(dev_priv, threshold[i]);
> +
> +		if (threshold[i] == -EINVAL)
> +			return -EINVAL;
> +	}
> +
> +set_watchdog:
> +	for_each_engine(engine, dev_priv, id) {
> +		struct intel_context *ce = to_intel_context(ctx, engine);
> +
> +		ce->watchdog_threshold = threshold[engine->class];
> +	}
> +
> +	return 0;
> +}
> +
>   static int ctx_setparam(struct i915_gem_context *ctx,
>   			struct drm_i915_gem_context_param *args)
>   {
> @@ -1640,6 +1723,10 @@ static int ctx_setparam(struct i915_gem_context *ctx,
>   		ret = set_engines(ctx, args);
>   		break;
>   
> +	case I915_CONTEXT_PARAM_WATCHDOG:
> +		ret = i915_gem_context_set_watchdog(ctx, args);
> +		break;
> +
>   	case I915_CONTEXT_PARAM_BAN_PERIOD:
>   	default:
>   		ret = -EINVAL;
> @@ -1843,6 +1930,10 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
>   		args->value = ctx->sched.priority >> I915_USER_PRIORITY_SHIFT;
>   		break;
>   
> +	case I915_CONTEXT_PARAM_WATCHDOG:
> +		ret = i915_gem_context_get_watchdog(ctx, args);
> +		break;
> +
>   	case I915_CONTEXT_PARAM_SSEU:
>   		ret = get_sseu(ctx, args);
>   		break;
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index 3f2c89740b0e..7dabdb3e0fad 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -1492,6 +1492,7 @@ struct drm_i915_gem_context_param {
>    * See struct i915_context_param_engines.
>    */
>   #define I915_CONTEXT_PARAM_ENGINES	0x9
> +#define I915_CONTEXT_PARAM_WATCHDOG	0x10
>   
>   	__u64 value;
>   };
> 

uAPI is still not documented in i915_drm.h and you have not considered 
the suggestion to use the array of structs, which would match the other 
bits of new uAPI we are adding. Like making ctx set param args->value 
point to an array of:

struct drm_i915_watchdog_timeout {
     struct {
         __u16 class;
         __u16 instance;
     };
     __u32 timeout_us;
};

Regards,

Tvrtko