[PATCH 5/5] drm/i915: Rely on spinlock protection for GPU error capture

Chris Wilson chris at chris-wilson.co.uk
Tue Jul 9 19:54:14 UTC 2019


Trust that we now have adequate protection over the low-level structures
via engine->active.lock, allowing us to capture the GPU error state
without the heavy hammer of stop_machine().

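A useful side effect is that this allows us to restore error capturing
for Braswell and Broxton, where it had been disabled for the VT-d
workaround to prevent recursive stop_machine() calls and deadlocks.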

Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem_gtt.c   |  5 --
 drivers/gpu/drm/i915/i915_gpu_error.c | 78 ++++++++++++---------------
 2 files changed, 35 insertions(+), 48 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index 236c964dd761..e0645ce4fb84 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -3135,11 +3135,6 @@ static int gen8_gmch_probe(struct i915_ggtt *ggtt)
 		ggtt->vm.insert_page    = bxt_vtd_ggtt_insert_page__BKL;
 		if (ggtt->vm.clear_range != nop_clear_range)
 			ggtt->vm.clear_range = bxt_vtd_ggtt_clear_range__BKL;
-
-		/* Prevent recursively calling stop_machine() and deadlocks. */
-		dev_info(dev_priv->drm.dev,
-			 "Disabling error capture for VT-d workaround\n");
-		i915_disable_error_state(dev_priv, -ENODEV);
 	}
 
 	ggtt->invalidate = gen6_ggtt_invalidate;
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index f297a43df1e9..5374e57a56f8 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -30,7 +30,6 @@
 #include <linux/ascii85.h>
 #include <linux/nmi.h>
 #include <linux/scatterlist.h>
-#include <linux/stop_machine.h>
 #include <linux/utsname.h>
 #include <linux/zlib.h>
 
@@ -46,6 +45,8 @@
 #include "i915_scatterlist.h"
 #include "intel_csr.h"
 
+#define ALLOW_FAIL (GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN)
+
 static inline const struct intel_engine_cs *
 engine_lookup(const struct drm_i915_private *i915, unsigned int id)
 {
@@ -114,7 +115,7 @@ static bool __i915_error_grow(struct drm_i915_error_state_buf *e, size_t len)
 	if (e->cur == e->end) {
 		struct scatterlist *sgl;
 
-		sgl = (typeof(sgl))__get_free_page(GFP_KERNEL);
+		sgl = (typeof(sgl))__get_free_page(ALLOW_FAIL);
 		if (!sgl) {
 			e->err = -ENOMEM;
 			return false;
@@ -134,7 +135,7 @@ static bool __i915_error_grow(struct drm_i915_error_state_buf *e, size_t len)
 	}
 
 	e->size = ALIGN(len + 1, SZ_64K);
-	e->buf = kmalloc(e->size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);
+	e->buf = kmalloc(e->size, ALLOW_FAIL);
 	if (!e->buf) {
 		e->size = PAGE_ALIGN(len + 1);
 		e->buf = kmalloc(e->size, GFP_KERNEL);
@@ -224,7 +225,7 @@ static bool compress_init(struct compress *c)
 
 	zstream->workspace =
 		kmalloc(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
-			GFP_ATOMIC | __GFP_NOWARN);
+			ALLOW_FAIL);
 	if (!zstream->workspace)
 		return false;
 
@@ -235,7 +236,7 @@ static bool compress_init(struct compress *c)
 
 	c->tmp = NULL;
 	if (i915_has_memcpy_from_wc())
-		c->tmp = (void *)__get_free_page(GFP_ATOMIC | __GFP_NOWARN);
+		c->tmp = (void *)__get_free_page(ALLOW_FAIL);
 
 	return true;
 }
@@ -247,7 +248,7 @@ static void *compress_next_page(struct drm_i915_error_object *dst)
 	if (dst->page_count >= dst->num_pages)
 		return ERR_PTR(-ENOSPC);
 
-	page = __get_free_page(GFP_ATOMIC | __GFP_NOWARN);
+	page = __get_free_page(GFP_KERNEL | __GFP_NOWARN);
 	if (!page)
 		return ERR_PTR(-ENOMEM);
 
@@ -349,7 +350,7 @@ static int compress_page(struct compress *c,
 	unsigned long page;
 	void *ptr;
 
-	page = __get_free_page(GFP_ATOMIC | __GFP_NOWARN);
+	page = __get_free_page(ALLOW_FAIL);
 	if (!page)
 		return -ENOMEM;
 
@@ -1006,8 +1007,7 @@ i915_error_object_create(struct drm_i915_private *i915,
 
 	num_pages = min_t(u64, vma->size, vma->obj->base.size) >> PAGE_SHIFT;
 	num_pages = DIV_ROUND_UP(10 * num_pages, 8); /* worstcase zlib growth */
-	dst = kmalloc(sizeof(*dst) + num_pages * sizeof(u32 *),
-		      GFP_ATOMIC | __GFP_NOWARN);
+	dst = kmalloc(sizeof(*dst) + num_pages * sizeof(u32 *), ALLOW_FAIL);
 	if (!dst)
 		return NULL;
 
@@ -1281,7 +1281,7 @@ static void engine_record_requests(struct intel_engine_cs *engine,
 	if (!count)
 		return;
 
-	ee->requests = kcalloc(count, sizeof(*ee->requests), GFP_ATOMIC);
+	ee->requests = kcalloc(count, sizeof(*ee->requests), ALLOW_FAIL);
 	if (!ee->requests)
 		return;
 
@@ -1362,11 +1362,11 @@ static void request_record_user_bo(struct i915_request *request,
 	if (!max)
 		return;
 
-	bo = kmalloc_array(max, sizeof(*bo), GFP_ATOMIC);
+	bo = kmalloc_array(max, sizeof(*bo), ALLOW_FAIL);
 	if (!bo) {
 		/* If we can't capture everything, try to capture something. */
 		max = min_t(long, max, PAGE_SIZE / sizeof(*bo));
-		bo = kmalloc_array(max, sizeof(*bo), GFP_ATOMIC);
+		bo = kmalloc_array(max, sizeof(*bo), ALLOW_FAIL);
 	}
 	if (!bo)
 		return;
@@ -1491,7 +1491,7 @@ static void gem_capture_vm(struct i915_gpu_state *error,
 
 	active_bo = NULL;
 	if (count)
-		active_bo = kcalloc(count, sizeof(*active_bo), GFP_ATOMIC);
+		active_bo = kcalloc(count, sizeof(*active_bo), ALLOW_FAIL);
 	if (active_bo)
 		count = capture_error_bo(active_bo,
 					 count, &vm->bound_list,
@@ -1541,7 +1541,7 @@ static void capture_pinned_buffers(struct i915_gpu_state *error)
 
 	bo = NULL;
 	if (count)
-		bo = kcalloc(count, sizeof(*bo), GFP_ATOMIC);
+		bo = kcalloc(count, sizeof(*bo), ALLOW_FAIL);
 	if (!bo)
 		return;
 
@@ -1566,8 +1566,8 @@ static void capture_uc_state(struct i915_gpu_state *error)
 	 * As modparams are generally accesible from the userspace make
 	 * explicit copies of the firmware paths.
 	 */
-	error_uc->guc_fw.path = kstrdup(i915->guc.fw.path, GFP_ATOMIC);
-	error_uc->huc_fw.path = kstrdup(i915->huc.fw.path, GFP_ATOMIC);
+	error_uc->guc_fw.path = kstrdup(i915->guc.fw.path, ALLOW_FAIL);
+	error_uc->huc_fw.path = kstrdup(i915->huc.fw.path, ALLOW_FAIL);
 	error_uc->guc_log = i915_error_object_create(i915, i915->guc.log.vma);
 }
 
@@ -1752,9 +1752,26 @@ static void capture_finish(struct i915_gpu_state *error)
 	ggtt->vm.clear_range(&ggtt->vm, slot, PAGE_SIZE);
 }
 
-static int capture(void *data)
+#define DAY_AS_SECONDS(x) (24 * 60 * 60 * (x))
+
+struct i915_gpu_state *
+i915_capture_gpu_state(struct drm_i915_private *i915)
 {
-	struct i915_gpu_state *error = data;
+	struct i915_gpu_state *error;
+
+	/* Check if GPU capture has been disabled */
+	error = READ_ONCE(i915->gpu_error.first_error);
+	if (IS_ERR(error))
+		return error;
+
+	error = kzalloc(sizeof(*error), GFP_KERNEL | __GFP_NOWARN);
+	if (!error) {
+		i915_disable_error_state(i915, -ENOMEM);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	kref_init(&error->ref);
+	error->i915 = i915;
 
 	error->time = ktime_get_real();
 	error->boottime = ktime_get_boottime();
@@ -1777,31 +1794,6 @@ static int capture(void *data)
 	error->epoch = capture_find_epoch(error);
 
 	capture_finish(error);
-	return 0;
-}
-
-#define DAY_AS_SECONDS(x) (24 * 60 * 60 * (x))
-
-struct i915_gpu_state *
-i915_capture_gpu_state(struct drm_i915_private *i915)
-{
-	struct i915_gpu_state *error;
-
-	/* Check if GPU capture has been disabled */
-	error = READ_ONCE(i915->gpu_error.first_error);
-	if (IS_ERR(error))
-		return error;
-
-	error = kzalloc(sizeof(*error), GFP_ATOMIC);
-	if (!error) {
-		i915_disable_error_state(i915, -ENOMEM);
-		return ERR_PTR(-ENOMEM);
-	}
-
-	kref_init(&error->ref);
-	error->i915 = i915;
-
-	stop_machine(capture, error, NULL);
 
 	return error;
 }
-- 
2.22.0



More information about the Intel-gfx-trybot mailing list