[Intel-gfx] [RFC PATCH 08/12] drm/i915: Capture the PPGTT pagetables on a GPU crash

Oscar Mateo oscar.mateo at intel.com
Fri Oct 27 18:01:11 UTC 2017


Or, at least, the first three levels (PML4, PDPs, PDs). We'll deal
with the PTs later, at the same time we record actual physical pages
with data.

We only do this when AubCrash is enabled, to save space, but
we could also do it unconditionally and maybe dump it in text
mode when in the legacy crash dump.

Signed-off-by: Oscar Mateo <oscar.mateo at intel.com>
Cc: Chris Wilson <chris at chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_aubcrash.c  | 169 ++++++++++++++++++++++++++++++++++
 drivers/gpu/drm/i915/i915_aubcrash.h  |  14 +++
 drivers/gpu/drm/i915/i915_drv.h       |   6 ++
 drivers/gpu/drm/i915/i915_gpu_error.c |   8 ++
 4 files changed, 197 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_aubcrash.c b/drivers/gpu/drm/i915/i915_aubcrash.c
index 95b75ab..1b613e4 100644
--- a/drivers/gpu/drm/i915/i915_aubcrash.c
+++ b/drivers/gpu/drm/i915/i915_aubcrash.c
@@ -40,6 +40,175 @@
  *
  */
 
+/*
+ * COPY_PX_ENTRIES - snapshot one page worth of paging-structure entries.
+ * @px: paging structure; the page copied is px_base(px)->page
+ * @storage: destination buffer; PAGE_SIZE bytes are written to it
+ *
+ * Maps the backing page with kmap_atomic(), copies PAGE_SIZE bytes into
+ * @storage and unmaps the page again.
+ *
+ * NOTE: this macro hides control flow. When @storage is NULL it executes
+ * "return -ENOMEM" in the *calling* function, so it can only be used inside
+ * functions that return int. @storage is evaluated more than once, so it
+ * must not have side effects.
+ */
+#define COPY_PX_ENTRIES(px, storage) do { \
+	u64 *__vaddr; \
+	if (!(storage)) \
+		return -ENOMEM; \
+	__vaddr = kmap_atomic(px_base(px)->page); \
+	memcpy((storage), __vaddr, PAGE_SIZE); \
+	kunmap_atomic(__vaddr); \
+} while (0)
+
+/*
+ * record_pml4 - capture the top paging level and size the PDP record array.
+ * @e_pml4: capture record to fill in
+ * @pml4: PML4 to copy; only dereferenced when @is_48bit is true
+ * @scratch_pdp: entries of @pml4 equal to this pointer are not counted
+ * @is_48bit: true for a 4-level (PML4) layout
+ *
+ * With @is_48bit set, stores px_dma(@pml4) in the record, copies the PML4
+ * page into a freshly allocated page and bumps nxt_lvl_count once for every
+ * PDP that is not @scratch_pdp. Otherwise nxt_lvl_count is simply set to 1.
+ * In both cases a zeroed array of nxt_lvl_count child records is allocated.
+ *
+ * Returns 0 on success, or -ENOMEM when the page (via COPY_PX_ENTRIES) or
+ * the child array cannot be allocated; nxt_lvl_count is reset to 0 when the
+ * array allocation fails.
+ *
+ * NOTE(review): not static and no prototype is visible in this patch;
+ * confirm whether external linkage is intended.
+ */
+int record_pml4(struct drm_i915_error_pagemap_lvl *e_pml4,
+		struct i915_pml4 *pml4,
+		struct i915_page_directory_pointer *scratch_pdp,
+		bool is_48bit)
+{
+	int n;
+
+	if (!is_48bit) {
+		e_pml4->nxt_lvl_count = 1;
+	} else {
+		e_pml4->paddr = px_dma(pml4);
+		e_pml4->storage =
+			(u64 *)__get_free_page(GFP_ATOMIC | __GFP_NOWARN);
+		/* May return -ENOMEM from this function. */
+		COPY_PX_ENTRIES(pml4, e_pml4->storage);
+
+		for (n = 0; n < GEN8_PML4ES_PER_PML4; n++) {
+			if (pml4->pdps[n] == scratch_pdp)
+				continue;
+			e_pml4->nxt_lvl_count++;
+		}
+	}
+
+	e_pml4->nxt_lvl = kcalloc(e_pml4->nxt_lvl_count,
+				  sizeof(*e_pml4->nxt_lvl), GFP_ATOMIC);
+	if (e_pml4->nxt_lvl)
+		return 0;
+
+	e_pml4->nxt_lvl_count = 0;
+	return -ENOMEM;
+}
+
+/*
+ * record_pdp - capture one page directory pointer level.
+ * @e_pdp: capture record to fill in
+ * @pdp: page directory pointer structure to capture
+ * @is_48bit: true for a 4-level layout; only then is the PDP page copied
+ *
+ * With @is_48bit set, stores px_dma(@pdp) in the record and copies the PDP
+ * page into a freshly allocated page. In every case a zeroed array of
+ * @pdp->used_pdpes child records is allocated for the page directories.
+ *
+ * Returns 0 on success, or -ENOMEM when the page (via COPY_PX_ENTRIES) or
+ * the child array cannot be allocated; nxt_lvl_count is reset to 0 when the
+ * array allocation fails.
+ */
+int record_pdp(struct drm_i915_error_pagemap_lvl *e_pdp,
+	       struct i915_page_directory_pointer *pdp,
+	       bool is_48bit)
+{
+	struct drm_i915_error_pagemap_lvl *children;
+
+	if (is_48bit) {
+		e_pdp->paddr = px_dma(pdp);
+		e_pdp->storage =
+			(u64 *)__get_free_page(GFP_ATOMIC | __GFP_NOWARN);
+		/* May return -ENOMEM from this function. */
+		COPY_PX_ENTRIES(pdp, e_pdp->storage);
+	}
+
+	e_pdp->nxt_lvl_count = pdp->used_pdpes;
+	children = kcalloc(e_pdp->nxt_lvl_count, sizeof(*children),
+			   GFP_ATOMIC);
+	e_pdp->nxt_lvl = children;
+	if (children)
+		return 0;
+
+	e_pdp->nxt_lvl_count = 0;
+	return -ENOMEM;
+}
+
+/*
+ * record_pd - capture one page directory.
+ * @e_pd: capture record to fill in
+ * @pd: page directory to capture
+ *
+ * Stores px_dma(@pd) in the record, copies the page directory page into a
+ * freshly allocated page and allocates a zeroed array of @pd->used_pdes
+ * child records for the level below.
+ *
+ * Returns 0 on success, or -ENOMEM when the page (via COPY_PX_ENTRIES) or
+ * the child array cannot be allocated; nxt_lvl_count is reset to 0 when the
+ * array allocation fails.
+ */
+int record_pd(struct drm_i915_error_pagemap_lvl *e_pd,
+	      struct i915_page_directory *pd)
+{
+	struct drm_i915_error_pagemap_lvl *children;
+
+	e_pd->paddr = px_dma(pd);
+	e_pd->storage = (u64 *)__get_free_page(GFP_ATOMIC | __GFP_NOWARN);
+	/* May return -ENOMEM from this function. */
+	COPY_PX_ENTRIES(pd, e_pd->storage);
+
+	e_pd->nxt_lvl_count = pd->used_pdes;
+	children = kcalloc(e_pd->nxt_lvl_count, sizeof(*children),
+			   GFP_ATOMIC);
+	e_pd->nxt_lvl = children;
+	if (children)
+		return 0;
+
+	e_pd->nxt_lvl_count = 0;
+	return -ENOMEM;
+}
+
+/*
+ * i915_error_record_ppgtt - capture the upper paging levels of a PPGTT.
+ * @error: GPU error state being populated
+ * @vm: address space to capture; GGTT address spaces are ignored
+ * @idx: slot in error->ppgtt_pml4[] that receives the capture
+ *
+ * Records the PML4 (48-bit address spaces only), every PDP that is not the
+ * scratch PDP and, below each of those, every PD that is not the scratch PD.
+ * The page tables themselves are not captured here.
+ *
+ * Capture is best effort: on the first allocation failure the function
+ * returns and leaves whatever was recorded so far in the slot.
+ */
+void i915_error_record_ppgtt(struct i915_gpu_state *error,
+			     struct i915_address_space *vm,
+			     int idx)
+{
+	struct i915_hw_ppgtt *ppgtt;
+	struct drm_i915_error_pagemap_lvl *e_pml4;
+	struct i915_pml4 *pml4;
+	int l3, l2, max_lvl3, max_lvl2;
+	unsigned int i, j;
+	bool is_48bit;
+
+	if (i915_is_ggtt(vm))
+		return;
+
+	ppgtt = i915_vm_to_ppgtt(vm);
+	is_48bit = i915_vm_is_48bit(&ppgtt->base);
+	if (is_48bit) {
+		max_lvl3 = GEN8_PML4ES_PER_PML4;
+		max_lvl2 = GEN8_4LVL_PDPES;
+	} else {
+		/* No PML4: a single PDP embedded in the ppgtt is walked. */
+		max_lvl3 = 1;
+		max_lvl2 = GEN8_3LVL_PDPES;
+	}
+
+	/* PML4 */
+	pml4 = is_48bit ? &ppgtt->pml4 : NULL;
+	e_pml4 = &error->ppgtt_pml4[idx];
+	if (record_pml4(e_pml4, pml4, vm->scratch_pdp, is_48bit) < 0)
+		return;
+
+	/*
+	 * PDP
+	 *
+	 * i and j index the output arrays and only advance for entries that
+	 * are actually recorded. They are bounded in the loop conditions so
+	 * that an array is never written at or beyond its allocated length,
+	 * including when that length is zero.
+	 */
+	for (l3 = 0, i = 0; l3 < max_lvl3 && i < e_pml4->nxt_lvl_count; l3++) {
+		struct drm_i915_error_pagemap_lvl *e_pdp;
+		struct i915_page_directory_pointer *pdp;
+
+		pdp = is_48bit ? pml4->pdps[l3] : &ppgtt->pdp;
+		if (pdp == vm->scratch_pdp)
+			continue;
+
+		e_pdp = &e_pml4->nxt_lvl[i];
+		if (record_pdp(e_pdp, pdp, is_48bit) < 0)
+			return;
+
+		/* PD */
+		for (l2 = 0, j = 0;
+		     l2 < max_lvl2 && j < e_pdp->nxt_lvl_count; l2++) {
+			struct i915_page_directory *pd;
+
+			pd = pdp->page_directory[l2];
+			if (pd == vm->scratch_pd)
+				continue;
+
+			if (record_pd(&e_pdp->nxt_lvl[j], pd) < 0)
+				return;
+
+			j++;
+		}
+
+		i++;
+	}
+
+	/* XXX: Do I want to dump the scratch pdp/pd/pt/page? */
+	/* TODO: Support huge pages */
+}
+
+/*
+ * i915_error_free_ppgtt - release the page-table capture held in one slot.
+ * @error: GPU error state owning the capture
+ * @idx: slot in error->ppgtt_pml4[] to release
+ *
+ * Frees the page copy stored at every captured level together with the
+ * nxt_lvl arrays linking the levels.
+ *
+ * Every PDP/PD record is an *element* of its parent's nxt_lvl array, so it
+ * must never be handed to kfree() itself; only the array base pointer that
+ * kcalloc() returned may be freed, once, after all elements were visited.
+ *
+ * Records that were never filled in are expected to be all-zero (the arrays
+ * come from kcalloc()), for which free_page() and kfree() do nothing.
+ */
+void i915_error_free_ppgtt(struct i915_gpu_state *error, int idx)
+{
+	struct drm_i915_error_pagemap_lvl *e_pml4 = &error->ppgtt_pml4[idx];
+	unsigned int i, j;
+
+	for (i = 0; i < e_pml4->nxt_lvl_count; i++) {
+		struct drm_i915_error_pagemap_lvl *e_pdp = &e_pml4->nxt_lvl[i];
+
+		for (j = 0; j < e_pdp->nxt_lvl_count; j++) {
+			struct drm_i915_error_pagemap_lvl *e_pd =
+				&e_pdp->nxt_lvl[j];
+
+			free_page((unsigned long)e_pd->storage);
+			/* Array allocated by record_pd() for the next level. */
+			kfree(e_pd->nxt_lvl);
+		}
+
+		free_page((unsigned long)e_pdp->storage);
+		/* Frees all PD records of this PDP in one go. */
+		kfree(e_pdp->nxt_lvl);
+	}
+
+	free_page((unsigned long)e_pml4->storage);
+	/* Frees all PDP records in one go. */
+	kfree(e_pml4->nxt_lvl);
+
+	/* Leave the slot empty so that a repeated call is harmless. */
+	e_pml4->storage = NULL;
+	e_pml4->nxt_lvl = NULL;
+	e_pml4->nxt_lvl_count = 0;
+}
+
 int i915_error_state_to_aub(struct drm_i915_error_state_buf *m,
 			    const struct i915_gpu_state *error)
 {
diff --git a/drivers/gpu/drm/i915/i915_aubcrash.h b/drivers/gpu/drm/i915/i915_aubcrash.h
index bab1953..af7d42e 100644
--- a/drivers/gpu/drm/i915/i915_aubcrash.h
+++ b/drivers/gpu/drm/i915/i915_aubcrash.h
@@ -26,11 +26,25 @@
 
 #if IS_ENABLED(CONFIG_DRM_I915_AUB_CRASH_DUMP)
 
+/*
+ * Capture the PPGTT paging structures of @vm into slot @idx of @error, and
+ * release such a capture again. Real implementations, used when
+ * CONFIG_DRM_I915_AUB_CRASH_DUMP is enabled.
+ */
+void i915_error_record_ppgtt(struct i915_gpu_state *error,
+			     struct i915_address_space *vm,
+			     int idx);
+void i915_error_free_ppgtt(struct i915_gpu_state *error, int idx);
 int i915_error_state_to_aub(struct drm_i915_error_state_buf *m,
                             const struct i915_gpu_state *error);
 
 #else
 
+/*
+ * No-op stand-ins used when CONFIG_DRM_I915_AUB_CRASH_DUMP is disabled, so
+ * callers do not need their own #ifdefs.
+ */
+static inline void i915_error_record_ppgtt(struct i915_gpu_state *error,
+					   struct i915_address_space *vm,
+					   int idx)
+{
+}
+
+/* Nothing is captured in this configuration, so there is nothing to free. */
+static inline void i915_error_free_ppgtt(struct i915_gpu_state *error, int idx)
+{
+}
+
 static inline int i915_error_state_to_aub(struct drm_i915_error_state_buf *m,
 					  const struct i915_gpu_state *error)
 {
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index ae3c8b1..9b2539a 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1048,6 +1048,12 @@ struct i915_gpu_state {
 		u32 cache_level:3;
 	} *active_bo[I915_NUM_ENGINES], *pinned_bo;
 	u32 active_bo_count[I915_NUM_ENGINES], pinned_bo_count;
+	/*
+	 * One captured level of a PPGTT page-table tree. The same record type
+	 * is used for the PML4, PDP and PD levels; ppgtt_pml4[] holds one root
+	 * record per active_vm[] slot.
+	 */
+	struct drm_i915_error_pagemap_lvl {
+		/*
+		 * Address of the captured structure, taken from px_dma().
+		 * NOTE(review): declared phys_addr_t but filled from a DMA
+		 * address helper - confirm the intended type.
+		 */
+		phys_addr_t paddr;
+		/* Copy of the structure's page (PAGE_SIZE bytes), or NULL. */
+		u64 *storage;
+		/* kcalloc()ed array of records for the level below. */
+		struct drm_i915_error_pagemap_lvl *nxt_lvl;
+		/* Number of elements in nxt_lvl. */
+		uint nxt_lvl_count;
+	} ppgtt_pml4[I915_NUM_ENGINES];
 	struct i915_address_space *active_vm[I915_NUM_ENGINES];
 };
 
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index e71d2c8..39b69d8 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -31,6 +31,7 @@
 #include <linux/stop_machine.h>
 #include <linux/zlib.h>
 #include "i915_drv.h"
+#include "i915_aubcrash.h"
 
 static const char *engine_str(int engine)
 {
@@ -866,6 +867,9 @@ void __i915_gpu_state_free(struct kref *error_ref)
 	i915_error_object_free(error->semaphore);
 	i915_error_object_free(error->guc_log);
 
+	for (i = 0; i < ARRAY_SIZE(error->ppgtt_pml4); i++)
+		i915_error_free_ppgtt(error, i);
+
 	for (i = 0; i < ARRAY_SIZE(error->active_bo); i++)
 		kfree(error->active_bo[i]);
 	kfree(error->pinned_bo);
@@ -1520,6 +1524,9 @@ static void i915_gem_capture_vm(struct drm_i915_private *dev_priv,
 	else
 		count = 0;
 
+	if (INTEL_GEN(dev_priv) >= 8)
+		i915_error_record_ppgtt(error, vm, idx);
+
 	error->active_vm[idx] = vm;
 	error->active_bo[idx] = active_bo;
 	error->active_bo_count[idx] = count;
@@ -1533,6 +1540,7 @@ static void i915_capture_active_buffers(struct drm_i915_private *dev_priv,
 	BUILD_BUG_ON(ARRAY_SIZE(error->engine) > ARRAY_SIZE(error->active_bo));
 	BUILD_BUG_ON(ARRAY_SIZE(error->active_bo) != ARRAY_SIZE(error->active_vm));
 	BUILD_BUG_ON(ARRAY_SIZE(error->active_bo) != ARRAY_SIZE(error->active_bo_count));
+	BUILD_BUG_ON(ARRAY_SIZE(error->active_bo) != ARRAY_SIZE(error->ppgtt_pml4));
 
 	/* Scan each engine looking for unique active contexts/vm */
 	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
-- 
1.9.1



More information about the Intel-gfx mailing list