[Intel-gfx] [RFC 4/6] drm/i915: Allow userspace to configure the watchdog

Daniel Vetter daniel at ffwll.ch
Tue Mar 16 10:09:18 UTC 2021


On Fri, Mar 12, 2021 at 03:46:20PM +0000, Tvrtko Ursulin wrote:
> From: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> 
> Idea here is to make the watchdog mechanism more useful than for just
> default request/fence expiry.
> 
> To this effect a new context param I915_CONTEXT_PARAM_WATCHDOG is added
> where the value fields allows passing in a timeout in micro-seconds.
> 
> This allows userspace to set a limit to how long they expect their batches
> to take, or otherwise they will be cancelled, and userspace notified via
> one of the available mechanisms.
> 
> Main attractiveness of adding uapi here is perhaps to extend the proposal
> by passing in a structure instead of a single value, like for illustration
> only:
> 
> struct drm_i915_gem_context_watchdog {
> 	__u64 flags;
>  #define I915_CONTEXT_WATCHDOG_WALL_TIME	BIT(0)
>  #define I915_CONTEXT_WATCHDOG_GPU_TIME		BIT(1)
>  #define I915_CONTEXT_WATCHDOG_FROM_SUBMIT	BIT(2)
>  #define I915_CONTEXT_WATCHDOG_FROM_RUNNABLE	BIT(3)
> 	__64 timeout_us;
> };
> 
> Point being to prepare the uapi for different semantics from the start.
> Given how not a single one makes complete sense for all use cases. And
> also perhaps satisfy the long wanted media watchdog feature request.
> 
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> Cc: Daniel Vetter <daniel.vetter at ffwll.ch>

I think what's been discussed forever is a gpu time watchdog for media.
Otherwise I don't think userspace should be allowed to configure anything
here, and even the watch really should only allow to fail faster.

Now for the workloads that we do break with the next patch (and those
exist), the real solution is the preempt-ctx dma_fence that amdkfd does,
and which we're also working on. But unfortunately that's a lot more work
than a quick simple patch.

So please drop this one from the next round. If media wants a fast
gpu timeout then we should roll that in as a separate series, with
userspace and igt and all the usual bells&whistles.
-Daniel

> ---
>  drivers/gpu/drm/i915/gem/i915_gem_context.c   | 57 +++++++++++++++++++
>  .../gpu/drm/i915/gem/i915_gem_context_types.h |  4 ++
>  drivers/gpu/drm/i915/gt/intel_context_param.h | 11 +++-
>  include/uapi/drm/i915_drm.h                   |  5 +-
>  4 files changed, 75 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> index ca37d93ef5e7..32b05af4fc8f 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> @@ -233,6 +233,8 @@ static void intel_context_set_gem(struct intel_context *ce,
>  	if (ctx->sched.priority >= I915_PRIORITY_NORMAL &&
>  	    intel_engine_has_timeslices(ce->engine))
>  		__set_bit(CONTEXT_USE_SEMAPHORES, &ce->flags);
> +
> +	intel_context_set_watchdog_us(ce, ctx->watchdog.timeout_us);
>  }
>  
>  static void __free_engines(struct i915_gem_engines *e, unsigned int count)
> @@ -1397,6 +1399,28 @@ static int set_ringsize(struct i915_gem_context *ctx,
>  				 __intel_context_ring_size(args->value));
>  }
>  
> +static int __apply_watchdog(struct intel_context *ce, void *timeout_us)
> +{
> +	return intel_context_set_watchdog_us(ce, (uintptr_t)timeout_us);
> +}
> +
> +static int set_watchdog(struct i915_gem_context *ctx,
> +			struct drm_i915_gem_context_param *args)
> +{
> +	int ret;
> +
> +	if (args->size)
> +		return -EINVAL;
> +
> +	ret = context_apply_all(ctx, __apply_watchdog,
> +				(void *)(uintptr_t)args->value);
> +
> +	if (!ret)
> +		ctx->watchdog.timeout_us = args->value;
> +
> +	return ret;
> +}
> +
>  static int __get_ringsize(struct intel_context *ce, void *arg)
>  {
>  	long sz;
> @@ -1426,6 +1450,17 @@ static int get_ringsize(struct i915_gem_context *ctx,
>  	return 0;
>  }
>  
> +static int get_watchdog(struct i915_gem_context *ctx,
> +			struct drm_i915_gem_context_param *args)
> +{
> +	if (args->size)
> +		return -EINVAL;
> +
> +	args->value = ctx->watchdog.timeout_us;
> +
> +	return 0;
> +}
> +
>  int
>  i915_gem_user_to_context_sseu(struct intel_gt *gt,
>  			      const struct drm_i915_gem_context_param_sseu *user,
> @@ -2075,6 +2110,10 @@ static int ctx_setparam(struct drm_i915_file_private *fpriv,
>  		ret = set_ringsize(ctx, args);
>  		break;
>  
> +	case I915_CONTEXT_PARAM_WATCHDOG:
> +		ret = set_watchdog(ctx, args);
> +		break;
> +
>  	case I915_CONTEXT_PARAM_BAN_PERIOD:
>  	default:
>  		ret = -EINVAL;
> @@ -2196,6 +2235,19 @@ static int clone_schedattr(struct i915_gem_context *dst,
>  	return 0;
>  }
>  
> +static int clone_watchdog(struct i915_gem_context *dst,
> +			  struct i915_gem_context *src)
> +{
> +	int ret;
> +
> +	ret = context_apply_all(dst, __apply_watchdog,
> +				(void *)(uintptr_t)src->watchdog.timeout_us);
> +	if (!ret)
> +		dst->watchdog = src->watchdog;
> +
> +	return ret;
> +}
> +
>  static int clone_sseu(struct i915_gem_context *dst,
>  		      struct i915_gem_context *src)
>  {
> @@ -2279,6 +2331,7 @@ static int create_clone(struct i915_user_extension __user *ext, void *data)
>  		MAP(SSEU, clone_sseu),
>  		MAP(TIMELINE, clone_timeline),
>  		MAP(VM, clone_vm),
> +		MAP(WATCHDOG, clone_watchdog),
>  #undef MAP
>  	};
>  	struct drm_i915_gem_context_create_ext_clone local;
> @@ -2532,6 +2585,10 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
>  		ret = get_ringsize(ctx, args);
>  		break;
>  
> +	case I915_CONTEXT_PARAM_WATCHDOG:
> +		ret = get_watchdog(ctx, args);
> +		break;
> +
>  	case I915_CONTEXT_PARAM_BAN_PERIOD:
>  	default:
>  		ret = -EINVAL;
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
> index d5bc75508048..f17da7e26c43 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
> @@ -150,6 +150,10 @@ struct i915_gem_context {
>  	 */
>  	atomic_t active_count;
>  
> +	struct {
> +		u64 timeout_us;
> +	} watchdog;
> +
>  	/**
>  	 * @hang_timestamp: The last time(s) this context caused a GPU hang
>  	 */
> diff --git a/drivers/gpu/drm/i915/gt/intel_context_param.h b/drivers/gpu/drm/i915/gt/intel_context_param.h
> index f053d8633fe2..3ecacc675f41 100644
> --- a/drivers/gpu/drm/i915/gt/intel_context_param.h
> +++ b/drivers/gpu/drm/i915/gt/intel_context_param.h
> @@ -6,9 +6,18 @@
>  #ifndef INTEL_CONTEXT_PARAM_H
>  #define INTEL_CONTEXT_PARAM_H
>  
> -struct intel_context;
> +#include <linux/types.h>
> +
> +#include "intel_context.h"
>  
>  int intel_context_set_ring_size(struct intel_context *ce, long sz);
>  long intel_context_get_ring_size(struct intel_context *ce);
>  
> +static inline int
> +intel_context_set_watchdog_us(struct intel_context *ce, u64 timeout_us)
> +{
> +	ce->watchdog.timeout_us = timeout_us;
> +	return 0;
> +}
> +
>  #endif /* INTEL_CONTEXT_PARAM_H */
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index 1987e2ea79a3..a4c65780850c 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -1694,6 +1694,8 @@ struct drm_i915_gem_context_param {
>   * Default is 16 KiB.
>   */
>  #define I915_CONTEXT_PARAM_RINGSIZE	0xc
> +
> +#define I915_CONTEXT_PARAM_WATCHDOG	0xd
>  /* Must be kept compact -- no holes and well documented */
>  
>  	__u64 value;
> @@ -1863,7 +1865,8 @@ struct drm_i915_gem_context_create_ext_clone {
>  #define I915_CONTEXT_CLONE_SSEU		(1u << 3)
>  #define I915_CONTEXT_CLONE_TIMELINE	(1u << 4)
>  #define I915_CONTEXT_CLONE_VM		(1u << 5)
> -#define I915_CONTEXT_CLONE_UNKNOWN -(I915_CONTEXT_CLONE_VM << 1)
> +#define I915_CONTEXT_CLONE_WATCHDOG	(1u << 6)
> +#define I915_CONTEXT_CLONE_UNKNOWN -(I915_CONTEXT_CLONE_WATCHDOG << 1)
>  	__u64 rsvd;
>  };
>  
> -- 
> 2.27.0
> 

-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch


More information about the Intel-gfx mailing list