[Intel-gfx] [PATCH 02/13] drm/i915: Stop the machine whilst capturing the GPU crash dump
Daniel Vetter
daniel at ffwll.ch
Fri Aug 5 18:50:11 UTC 2016
On Fri, Aug 05, 2016 at 10:05:53AM +0100, Chris Wilson wrote:
> The error state is purposefully racy as we expect it to be called at any
> time and so have avoided any locking whilst capturing the crash dump.
> However, with multi-engine GPUs and multiple CPUs, those races can
> manifest into OOPSes as we attempt to chase dangling pointers freed on
> other CPUs. Under discussion are lots of ways to slow down normal
> operation in order to protect the post-mortem error capture, but what it
> we take the opposite approach and freeze the machine whilst the error
> capture runs (note the GPU may still running, but as long as we don't
> process any of the results the driver's bookkeeping will be static).
>
> Note that by of itself, this is not a complete fix. It also depends on
> the compiler barriers in list_add/list_del to prevent traversing the
> lists into the void.
The other important bit I think are NULL checks. I think the commit
message should mention that too.
>
> v2: Avoid drm_clflush_pages() inside stop_machine() as it may use
> stop_machine() itself for its wbinvd fallback.
Droppingt he clflush from error capture seems like a pretty big
regression, at least at a glance. Why does this not result in piles of
corrupted error state captures?
I guess since we need to flush on incoherent platforms before gpu access
anyway this is mostly true (as long as userspace doesn't do something
silly), but I think it should be captured in a comment at the place of the
clflush.
-Daniel
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
> drivers/gpu/drm/i915/Kconfig | 1 +
> drivers/gpu/drm/i915/i915_drv.h | 2 ++
> drivers/gpu/drm/i915/i915_gpu_error.c | 48 +++++++++++++++++++++--------------
> 3 files changed, 32 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig
> index 7769e469118f..7badcee88ebf 100644
> --- a/drivers/gpu/drm/i915/Kconfig
> +++ b/drivers/gpu/drm/i915/Kconfig
> @@ -4,6 +4,7 @@ config DRM_I915
> depends on X86 && PCI
> select INTEL_GTT
> select INTERVAL_TREE
> + select STOP_MACHINE
> # we need shmfs for the swappable backing store, and in particular
> # the shmem_readpage() which depends upon tmpfs
> select SHMEM
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 7054baa92061..9a7ffa49cd1f 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -481,6 +481,8 @@ struct drm_i915_error_state {
> struct kref ref;
> struct timeval time;
>
> + struct drm_i915_private *i915;
> +
> char error_msg[128];
> bool simulated;
> int iommu;
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index ced296983caa..b94a59733cf8 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -28,6 +28,7 @@
> */
>
> #include <generated/utsrelease.h>
> +#include <linux/stop_machine.h>
> #include "i915_drv.h"
>
> static const char *engine_str(int engine)
> @@ -684,14 +685,12 @@ i915_error_object_create(struct drm_i915_private *dev_priv,
>
> dst->page_count = num_pages;
> while (num_pages--) {
> - unsigned long flags;
> void *d;
>
> d = kmalloc(PAGE_SIZE, GFP_ATOMIC);
> if (d == NULL)
> goto unwind;
>
> - local_irq_save(flags);
> if (use_ggtt) {
> void __iomem *s;
>
> @@ -710,15 +709,10 @@ i915_error_object_create(struct drm_i915_private *dev_priv,
>
> page = i915_gem_object_get_page(src, i);
>
> - drm_clflush_pages(&page, 1);
> -
> s = kmap_atomic(page);
> memcpy(d, s, PAGE_SIZE);
> kunmap_atomic(s);
> -
> - drm_clflush_pages(&page, 1);
> }
> - local_irq_restore(flags);
>
> dst->pages[i++] = d;
> reloc_offset += PAGE_SIZE;
> @@ -1371,6 +1365,32 @@ static void i915_capture_gen_state(struct drm_i915_private *dev_priv,
> error->suspend_count = dev_priv->suspend_count;
> }
>
> +static int capture(void *data)
> +{
> + struct drm_i915_error_state *error = data;
> +
> + /* Ensure that what we readback from memory matches what the GPU sees */
> + wbinvd();
> +
> + i915_capture_gen_state(error->i915, error);
> + i915_capture_reg_state(error->i915, error);
> + i915_gem_record_fences(error->i915, error);
> + i915_gem_record_rings(error->i915, error);
> +
> + i915_capture_active_buffers(error->i915, error);
> + i915_capture_pinned_buffers(error->i915, error);
> +
> + do_gettimeofday(&error->time);
> +
> + error->overlay = intel_overlay_capture_error_state(error->i915);
> + error->display = intel_display_capture_error_state(error->i915);
> +
> + /* And make sure we don't leave trash in the CPU cache */
> + wbinvd();
> +
> + return 0;
> +}
> +
> /**
> * i915_capture_error_state - capture an error record for later analysis
> * @dev: drm device
> @@ -1399,19 +1419,9 @@ void i915_capture_error_state(struct drm_i915_private *dev_priv,
> }
>
> kref_init(&error->ref);
> + error->i915 = dev_priv;
>
> - i915_capture_gen_state(dev_priv, error);
> - i915_capture_reg_state(dev_priv, error);
> - i915_gem_record_fences(dev_priv, error);
> - i915_gem_record_rings(dev_priv, error);
> -
> - i915_capture_active_buffers(dev_priv, error);
> - i915_capture_pinned_buffers(dev_priv, error);
> -
> - do_gettimeofday(&error->time);
> -
> - error->overlay = intel_overlay_capture_error_state(dev_priv);
> - error->display = intel_display_capture_error_state(dev_priv);
> + stop_machine(capture, error, NULL);
>
> i915_error_capture_msg(dev_priv, error, engine_mask, error_msg);
> DRM_INFO("%s\n", error->error_msg);
> --
> 2.8.1
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
--
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
More information about the Intel-gfx
mailing list