[Intel-gfx] [32/33] drm/i915/gt: Expose heartbeat interval via sysfs
Steve Carbonari
steven.carbonari at intel.com
Wed Jan 22 21:38:55 UTC 2020
On Thu, Dec 12, 2019 at 02:04:58PM +0000, Chris Wilson wrote:
> We monitor the health of the system via periodic heartbeat pulses. The
> pulses also provide the opportunity to perform garbage collection.
> However, we interpret an incomplete pulse (a missed heartbeat) as an
> indication that the system is no longer responsive, i.e. hung, and
> perform an engine or full GPU reset. Given that the preemption
> granularity can be very coarse on a system, we let the sysadmin override
> our legacy timeouts which were "optimised" for desktop applications.
>
> The heartbeat interval can be adjusted per-engine using,
>
> /sys/class/drm/card?/engine/*/heartbeat_interval_ms
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
Code looks good.
Performed light testing with other sysfs related patches in the series.
The heartbeat_interval_ms file exists and can be modified.
Reviewed-by: Steve Carbonari <steven.carbonari at intel.com>
Tested-by: Steve Carbonari <steven.carbonari at intel.com>
> ---
> drivers/gpu/drm/i915/Kconfig.profile | 3 ++
> drivers/gpu/drm/i915/gt/intel_engine_sysfs.c | 47 ++++++++++++++++++++
> 2 files changed, 50 insertions(+)
>
> diff --git a/drivers/gpu/drm/i915/Kconfig.profile b/drivers/gpu/drm/i915/Kconfig.profile
> index 1f4e98a8532f..ba8767fc0d6e 100644
> --- a/drivers/gpu/drm/i915/Kconfig.profile
> +++ b/drivers/gpu/drm/i915/Kconfig.profile
> @@ -20,6 +20,9 @@ config DRM_I915_HEARTBEAT_INTERVAL
> check the health of the GPU and undertake regular house-keeping of
> internal driver state.
>
> + This is adjustable via
> + /sys/class/drm/card?/engine/*/heartbeat_interval_ms
> +
> May be 0 to disable heartbeats and therefore disable automatic GPU
> hang detection.
>
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_sysfs.c b/drivers/gpu/drm/i915/gt/intel_engine_sysfs.c
> index d299c66cf7ec..33b4c00b93f2 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_sysfs.c
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_sysfs.c
> @@ -9,6 +9,7 @@
>
> #include "i915_drv.h"
> #include "intel_engine.h"
> +#include "intel_engine_heartbeat.h"
> #include "intel_engine_sysfs.h"
>
> struct kobj_engine {
> @@ -315,6 +316,49 @@ preempt_timeout_show(struct kobject *kobj, struct kobj_attribute *attr,
> static struct kobj_attribute preempt_timeout_attr =
> __ATTR(preempt_timeout_ms, 0644, preempt_timeout_show, preempt_timeout_store);
>
> +static ssize_t
> +heartbeat_store(struct kobject *kobj, struct kobj_attribute *attr,
> + const char *buf, size_t count)
> +{
> + struct intel_engine_cs *engine = kobj_to_engine(kobj);
> + unsigned long long delay;
> + int err;
> +
> + /*
> + * We monitor the health of the system via periodic heartbeat pulses.
> + * The pulses also provide the opportunity to perform garbage
> + * collection. However, we interpret an incomplete pulse (a missed
> + * heartbeat) as an indication that the system is no longer responsive,
> + * i.e. hung, and perform an engine or full GPU reset. Given that the
> + * preemption granularity can be very coarse on a system, the optimal
> + * value for any workload is unknowable!
> + */
> +
> + err = kstrtoull(buf, 0, &delay);
> + if (err)
> + return err;
> +
> + if (delay >= jiffies_to_msecs(MAX_SCHEDULE_TIMEOUT))
> + return -EINVAL;
> +
> + err = intel_engine_set_heartbeat(engine, delay);
> + if (err)
> + return err;
> +
> + return count;
> +}
> +
> +static ssize_t
> +heartbeat_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
> +{
> + struct intel_engine_cs *engine = kobj_to_engine(kobj);
> +
> + return sprintf(buf, "%lu\n", engine->props.heartbeat_interval_ms);
> +}
> +
> +static struct kobj_attribute heartbeat_interval_attr =
> +__ATTR(heartbeat_interval_ms, 0644, heartbeat_show, heartbeat_store);
> +
> static void kobj_engine_release(struct kobject *kobj)
> {
> kfree(kobj);
> @@ -357,6 +401,9 @@ void intel_engines_add_sysfs(struct drm_i915_private *i915)
> &all_caps_attr.attr,
> &max_spin_attr.attr,
> &stop_timeout_attr.attr,
> +#if CONFIG_DRM_I915_HEARTBEAT_INTERVAL
> + &heartbeat_interval_attr.attr,
> +#endif
> NULL
> };
>
More information about the Intel-gfx
mailing list