[Intel-gfx] [CI 1/5] drm/i915: Allow disabling error capture
Chris Wilson
chris at chris-wilson.co.uk
Tue Oct 11 09:57:04 UTC 2016
On Tue, Oct 11, 2016 at 12:52:00PM +0300, Jani Nikula wrote:
> On Tue, 11 Oct 2016, Chris Wilson <chris at chris-wilson.co.uk> wrote:
> > We currently capture the GPU state after we detect a hang. This is vital
> > for us to both triage and debug hangs in the wild (post-mortem
> > debugging). However, it comes at the cost of running some potentially
> > dangerous code (since it has to make very few assumption about the state
> > of the driver) that is quite resource intensive.
> >
> > Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> > Reviewed-by: Joonas Lahtinen <joonas.lahtinen at linux.intel.com>
> > ---
> > drivers/gpu/drm/i915/Kconfig | 10 ++++++++++
> > drivers/gpu/drm/i915/i915_debugfs.c | 6 ++++++
> > drivers/gpu/drm/i915/i915_drv.h | 16 ++++++++++++++++
> > drivers/gpu/drm/i915/i915_gpu_error.c | 7 +++++++
> > drivers/gpu/drm/i915/i915_params.c | 9 +++++++++
> > drivers/gpu/drm/i915/i915_params.h | 1 +
> > drivers/gpu/drm/i915/i915_sysfs.c | 8 ++++++++
> > drivers/gpu/drm/i915/intel_display.c | 4 ++++
> > drivers/gpu/drm/i915/intel_overlay.c | 4 ++++
> > 9 files changed, 65 insertions(+)
> >
> > diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig
> > index 7769e469118f..10a6ac11b6a9 100644
> > --- a/drivers/gpu/drm/i915/Kconfig
> > +++ b/drivers/gpu/drm/i915/Kconfig
> > @@ -46,6 +46,16 @@ config DRM_I915_PRELIMINARY_HW_SUPPORT
> >
> > If in doubt, say "N".
> >
> > +config DRM_I915_CAPTURE_ERROR
> > + bool "Enable capturing GPU state following a hang"
> > + depends on DRM_I915
> > + default y
> > + help
> > + This option enables capturing the GPU state when a hang is detected.
> > + This information is vital for triaging hangs and assists in debugging.
> > +
> > + If in doubt, say "Y".
> > +
> > config DRM_I915_USERPTR
> > bool "Always enable userptr support"
> > depends on DRM_I915
> > diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> > index 20689f1cd719..e4b5ba771bea 100644
> > --- a/drivers/gpu/drm/i915/i915_debugfs.c
> > +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> > @@ -960,6 +960,8 @@ static int i915_hws_info(struct seq_file *m, void *data)
> > return 0;
> > }
> >
> > +#ifdef CONFIG_DRM_I915_CAPTURE_ERROR
> > +
> > static ssize_t
> > i915_error_state_write(struct file *filp,
> > const char __user *ubuf,
> > @@ -1042,6 +1044,8 @@ static const struct file_operations i915_error_state_fops = {
> > .release = i915_error_state_release,
> > };
> >
> > +#endif
> > +
> > static int
> > i915_next_seqno_get(void *data, u64 *val)
> > {
> > @@ -5398,7 +5402,9 @@ static const struct i915_debugfs_files {
> > {"i915_ring_missed_irq", &i915_ring_missed_irq_fops},
> > {"i915_ring_test_irq", &i915_ring_test_irq_fops},
> > {"i915_gem_drop_caches", &i915_drop_caches_fops},
> > +#ifdef CONFIG_DRM_I915_CAPTURE_ERROR
> > {"i915_error_state", &i915_error_state_fops},
> > +#endif
> > {"i915_next_seqno", &i915_next_seqno_fops},
> > {"i915_display_crc_ctl", &i915_display_crc_ctl_fops},
> > {"i915_pri_wm_latency", &i915_pri_wm_latency_fops},
> > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> > index 54d860e1c0fc..4570c4fa0287 100644
> > --- a/drivers/gpu/drm/i915/i915_drv.h
> > +++ b/drivers/gpu/drm/i915/i915_drv.h
> > @@ -3544,6 +3544,8 @@ static inline void intel_display_crc_init(struct drm_i915_private *dev_priv) {}
> > #endif
> >
> > /* i915_gpu_error.c */
> > +#ifdef CONFIG_DRM_I915_CAPTURE_ERROR
> > +
> > __printf(2, 3)
> > void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...);
> > int i915_error_state_to_str(struct drm_i915_error_state_buf *estr,
> > @@ -3564,6 +3566,20 @@ void i915_error_state_get(struct drm_device *dev,
> > void i915_error_state_put(struct i915_error_state_file_priv *error_priv);
> > void i915_destroy_error_state(struct drm_device *dev);
> >
> > +#else
> > +
> > +static inline void i915_capture_error_state(struct drm_i915_private *dev_priv,
> > + u32 engine_mask,
> > + const char *error_msg)
> > +{
> > +}
> > +
> > +static inline void i915_destroy_error_state(struct drm_device *dev)
> > +{
> > +}
> > +
> > +#endif
> > +
> > void i915_get_engine_instdone(struct drm_i915_private *dev_priv,
> > enum intel_engine_id engine_id,
> > struct intel_instdone *instdone);
> > diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> > index b5b58692ac5a..9b395ffa3b6a 100644
> > --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> > +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> > @@ -30,6 +30,8 @@
> > #include <generated/utsrelease.h>
> > #include "i915_drv.h"
> >
> > +#ifdef CONFIG_DRM_I915_CAPTURE_ERROR
> > +
> > static const char *engine_str(int engine)
> > {
> > switch (engine) {
> > @@ -1464,6 +1466,9 @@ void i915_capture_error_state(struct drm_i915_private *dev_priv,
> > struct drm_i915_error_state *error;
> > unsigned long flags;
> >
> > + if (!i915.error_capture)
> > + return;
> > +
> > if (READ_ONCE(dev_priv->gpu_error.first_error))
> > return;
> >
> > @@ -1549,6 +1554,8 @@ void i915_destroy_error_state(struct drm_device *dev)
> > kref_put(&error->ref, i915_error_state_free);
> > }
> >
> > +#endif
> > +
> > const char *i915_cache_level_str(struct drm_i915_private *i915, int type)
> > {
> > switch (type) {
> > diff --git a/drivers/gpu/drm/i915/i915_params.c b/drivers/gpu/drm/i915/i915_params.c
> > index 768ad89d9cd4..e72a41223535 100644
> > --- a/drivers/gpu/drm/i915/i915_params.c
> > +++ b/drivers/gpu/drm/i915/i915_params.c
> > @@ -47,6 +47,7 @@ struct i915_params i915 __read_mostly = {
> > .load_detect_test = 0,
> > .force_reset_modeset_test = 0,
> > .reset = true,
> > + .error_capture = true,
> > .invert_brightness = 0,
> > .disable_display = 0,
> > .enable_cmd_parser = 1,
> > @@ -115,6 +116,14 @@ MODULE_PARM_DESC(vbt_sdvo_panel_type,
> > module_param_named_unsafe(reset, i915.reset, bool, 0600);
> > MODULE_PARM_DESC(reset, "Attempt GPU resets (default: true)");
> >
> > +#ifdef CONFIG_DRM_I915_CAPTURE_ERROR
> > +module_param_named(error_capture, i915.error_capture, bool, 0600);
> > +MODULE_PARM_DESC(error_capture,
> > + "Record the GPU state following a hang. "
> > + "This information in /sys/class/drm/card<N>/error is vital for "
> > + "triaging and debugging hangs.");
> > +#endif
>
> The addition of the module parameter is really orthogonal to the config
> option, and should be added separately.
>
> The question is, what is the target audience of
> CONFIG_DRM_I915_CAPTURE_ERROR=n? Is the added config #ifdeffery worth
> the trouble? It *is* a maintainability burden, because we will only ever
> be building with CONFIG_DRM_I915_CAPTURE_ERROR=y ourselves, and we'll
> screw it up anyway. Does i915.error_capture=0 accomplish the same thing
> (apart from not reducing code size)? Do we really need to be able to
> prevent the root from enabling error capture on the fly?
What I had in mind was that RT kernels will probably default to turning
off this facility (since it will cause latencies in the hundreds of ms
range).
> What would be wrong with just adding the module parameter, and making
> CONFIG_DRM_I915_CAPTURE_ERROR the default value for it? I.e.
>
> .error_capture = IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR),
>
> in the initialization. No #ifdefs anywhere.
I wanted to turn it off by default at compilation time.
-Chris
--
Chris Wilson, Intel Open Source Technology Centre
More information about the Intel-gfx
mailing list