[Intel-gfx] [32/33] drm/i915/gt: Expose heartbeat interval via sysfs

Steve Carbonari steven.carbonari at intel.com
Wed Jan 22 21:38:55 UTC 2020


On Thu, Dec 12, 2019 at 02:04:58PM +0000, Chris Wilson wrote:
> We monitor the health of the system via periodic heartbeat pulses. The
> pulses also provide the opportunity to perform garbage collection.
> However, we interpret an incomplete pulse (a missed heartbeat) as an
> indication that the system is no longer responsive, i.e. hung, and
> perform an engine or full GPU reset. Given that the preemption
> granularity can be very coarse on a system, we let the sysadmin override
> our legacy timeouts which were "optimised" for desktop applications.
> 
> The heartbeat interval can be adjusted per-engine using,
> 
> 	/sys/class/drm/card?/engine/*/heartbeat_interval_ms
> 
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

Code looks good.
Performed light testing with other sysfs related patches in the series.
The heartbeat_interval_ms file exists and can be modified.

Reviewed-by: Steve Carbonari <steven.carbonari at intel.com>
Tested-by: Steve Carbonari <steven.carbonari at intel.com


> ---
>  drivers/gpu/drm/i915/Kconfig.profile         |  3 ++
>  drivers/gpu/drm/i915/gt/intel_engine_sysfs.c | 47 ++++++++++++++++++++
>  2 files changed, 50 insertions(+)
> 
> diff --git a/drivers/gpu/drm/i915/Kconfig.profile b/drivers/gpu/drm/i915/Kconfig.profile
> index 1f4e98a8532f..ba8767fc0d6e 100644
> --- a/drivers/gpu/drm/i915/Kconfig.profile
> +++ b/drivers/gpu/drm/i915/Kconfig.profile
> @@ -20,6 +20,9 @@ config DRM_I915_HEARTBEAT_INTERVAL
>  	  check the health of the GPU and undertake regular house-keeping of
>  	  internal driver state.
>  
> +	  This is adjustable via
> +	  /sys/class/drm/card?/engine/*/heartbeat_interval_ms
> +
>  	  May be 0 to disable heartbeats and therefore disable automatic GPU
>  	  hang detection.
>  
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_sysfs.c b/drivers/gpu/drm/i915/gt/intel_engine_sysfs.c
> index d299c66cf7ec..33b4c00b93f2 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_sysfs.c
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_sysfs.c
> @@ -9,6 +9,7 @@
>  
>  #include "i915_drv.h"
>  #include "intel_engine.h"
> +#include "intel_engine_heartbeat.h"
>  #include "intel_engine_sysfs.h"
>  
>  struct kobj_engine {
> @@ -315,6 +316,49 @@ preempt_timeout_show(struct kobject *kobj, struct kobj_attribute *attr,
>  static struct kobj_attribute preempt_timeout_attr =
>  __ATTR(preempt_timeout_ms, 0644, preempt_timeout_show, preempt_timeout_store);
>  
> +static ssize_t
> +heartbeat_store(struct kobject *kobj, struct kobj_attribute *attr,
> +		const char *buf, size_t count)
> +{
> +	struct intel_engine_cs *engine = kobj_to_engine(kobj);
> +	unsigned long long delay;
> +	int err;
> +
> +	/*
> +	 * We monitor the health of the system via periodic heartbeat pulses.
> +	 * The pulses also provide the opportunity to perform garbage
> +	 * collection.  However, we interpret an incomplete pulse (a missed
> +	 * heartbeat) as an indication that the system is no longer responsive,
> +	 * i.e. hung, and perform an engine or full GPU reset. Given that the
> +	 * preemption granularity can be very coarse on a system, the optimal
> +	 * value for any workload is unknowable!
> +	 */
> +
> +	err = kstrtoull(buf, 0, &delay);
> +	if (err)
> +		return err;
> +
> +	if (delay >= jiffies_to_msecs(MAX_SCHEDULE_TIMEOUT))
> +		return -EINVAL;
> +
> +	err = intel_engine_set_heartbeat(engine, delay);
> +	if (err)
> +		return err;
> +
> +	return count;
> +}
> +
> +static ssize_t
> +heartbeat_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
> +{
> +	struct intel_engine_cs *engine = kobj_to_engine(kobj);
> +
> +	return sprintf(buf, "%lu\n", engine->props.heartbeat_interval_ms);
> +}
> +
> +static struct kobj_attribute heartbeat_interval_attr =
> +__ATTR(heartbeat_interval_ms, 0644, heartbeat_show, heartbeat_store);
> +
>  static void kobj_engine_release(struct kobject *kobj)
>  {
>  	kfree(kobj);
> @@ -357,6 +401,9 @@ void intel_engines_add_sysfs(struct drm_i915_private *i915)
>  		&all_caps_attr.attr,
>  		&max_spin_attr.attr,
>  		&stop_timeout_attr.attr,
> +#if CONFIG_DRM_I915_HEARTBEAT_INTERVAL
> +		&heartbeat_interval_attr.attr,
> +#endif
>  		NULL
>  	};
>  


More information about the Intel-gfx mailing list