[PATCH 1/1] drm/amdgpu: Dump PDEs and PTEs on VM faults (v2)

Fri Jul 12 23:08:17 UTC 2019

Walk page table for the faulting address and dump PDEs and PTEs at
all levels. Also flag discrepancies where a PDE points to a different
address than the next level PDB or PTB BO.

v2:
* Fix address shift for GFXv8
* Redo PDB/PTB address checking to work on all generations

Signed-off-by: Felix Kuehling <Felix.Kuehling at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c |  5 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h |  2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  | 87 ++++++++++++++++++++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h  |  7 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c   |  6 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   |  5 +-
 6 files changed, 103 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index bbbf069efb77..78440748c87f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -1505,9 +1505,8 @@ static bool amdgpu_ttm_bo_eviction_valuable(struct ttm_buffer_object *bo,
  * This is used to access VRAM that backs a buffer object via MMIO
  * access for debugging purposes.
  */
-static int amdgpu_ttm_access_memory(struct ttm_buffer_object *bo,
-				    unsigned long offset,
-				    void *buf, int len, int write)
+int amdgpu_ttm_access_memory(struct ttm_buffer_object *bo, unsigned long offset,
+			     void *buf, int len, int write)
 {
 	struct amdgpu_bo *abo = ttm_to_amdgpu_bo(bo);
 	struct amdgpu_device *adev = amdgpu_ttm_adev(abo->tbo.bdev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
index bccb8c49e597..cffbafffa9d7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
@@ -83,6 +83,8 @@ void amdgpu_ttm_fini(struct amdgpu_device *adev);
 void amdgpu_ttm_set_buffer_funcs_status(struct amdgpu_device *adev,
 					bool enable);
 
+int amdgpu_ttm_access_memory(struct ttm_buffer_object *bo, unsigned long offset,
+			     void *buf, int len, int write);
 int amdgpu_copy_buffer(struct amdgpu_ring *ring, uint64_t src_offset,
 		       uint64_t dst_offset, uint32_t byte_count,
 		       struct reservation_object *resv,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 1951f2abbdbc..38a2f66cf095 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -544,6 +544,86 @@ static void amdgpu_vm_pt_next_dfs(struct amdgpu_device *adev,
 	     amdgpu_vm_pt_continue_dfs((start), (entry));			\
 	     (entry) = (cursor).entry, amdgpu_vm_pt_next_dfs((adev), &(cursor)))
 
+/**
+ * amdgpu_vm_dump_pte - dump PTEs along a page table walk
+ *
+ * @adev: amdgpu device pointer
+ * @vm: VM address space
+ * @addr: virtual address
+ *
+ * Walks the page table of @vm at @addr and prints each PDE/PTE on one
+ * line; "!" flags a mismatch with the previous PDE, "?" a failed read.
+ */
+void amdgpu_vm_dump_pte(struct amdgpu_device *adev, struct amdgpu_vm *vm,
+			uint64_t addr)
+{
+	static const char *level_entry[4] = {"PDE2", "PDE1", "PDE0", "PTE"};
+	static const char *level_block[4] = {"PDB2", "PDB1", "PDB0", "PTB"};
+	uint64_t last_addr = vm->root.base.bo->tbo.offset;
+	uint64_t last_flags = AMDGPU_PTE_VALID;
+	struct amdgpu_vm_pt_cursor cursor;
+	bool pde_known = true;
+	uint64_t last_pde;
+	char buf[128] = "";
+	int i = 0;
+
+	/* PDE that would point at the root PDB, for the first comparison */
+	amdgpu_gmc_get_vm_pde(adev, adev->vm_manager.root_level,
+			      &last_addr, &last_flags);
+	last_pde = last_addr | last_flags;
+
+	/* @addr is in bytes; callers shift by AMDGPU_GPU_PAGE_SHIFT */
+	amdgpu_vm_pt_start(adev, vm, addr >> AMDGPU_GPU_PAGE_SHIFT, &cursor);
+
+	do {
+		unsigned int mask, shift, idx;
+		struct amdgpu_bo *bo;
+		uint64_t pte = 0;
+
+		mask = amdgpu_vm_entries_mask(adev, cursor.level);
+		shift = amdgpu_vm_level_shift(adev, cursor.level);
+		idx = (cursor.pfn >> shift) & mask;
+
+		bo = cursor.entry->base.bo;
+		if (bo) {
+			uint64_t this_flags = AMDGPU_PTE_VALID;
+			uint64_t this_addr = bo->tbo.offset;
+
+			/* "!": previous PDE does not point at this BO */
+			amdgpu_gmc_get_vm_pde(adev, cursor.level,
+					      &this_addr, &this_flags);
+			if (pde_known && (this_addr | this_flags) != last_pde)
+				i += scnprintf(buf + i, sizeof(buf) - i, "!");
+
+			/* On a failed read pte stays 0 and is marked "?" */
+			pde_known = amdgpu_ttm_access_memory(&bo->tbo,
+					idx * sizeof(pte), &pte, sizeof(pte),
+					false) >= 0;
+			i += scnprintf(buf + i, sizeof(buf) - i,
+				       "%s[%u]=0x%llx%s ",
+				       level_entry[cursor.level], idx, pte,
+				       pde_known ? "" : "?");
+			last_pde = pte;
+		} else {
+			/* "!": valid previous PDE but no PDB/PTB BO */
+			if ((last_pde & AMDGPU_PTE_VALID) &&
+			    !(last_pde & AMDGPU_PDE_PTE))
+				i += scnprintf(buf + i, sizeof(buf) - i, "!");
+			i += scnprintf(buf + i, sizeof(buf) - i,
+				       "no %s ", level_block[cursor.level]);
+			last_pde = 0;
+			pde_known = true;
+		}
+
+		++cursor.level;
+		cursor.parent = cursor.entry;
+		if (!cursor.entry->entries)
+			break;
+		cursor.entry = &cursor.entry->entries[idx];
+	} while (cursor.entry);
+	dev_err(adev->dev, "%s\n", buf);
+}
+
 /**
  * amdgpu_vm_get_pd_bo - add the VM PD to a validation list
  *
@@ -3081,8 +3161,9 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
  * @pasid: PASID identifier for VM
  * @task_info: task_info to fill.
  */
-void amdgpu_vm_get_task_info(struct amdgpu_device *adev, unsigned int pasid,
-			 struct amdgpu_task_info *task_info)
+struct amdgpu_vm *amdgpu_vm_get_task_info(struct amdgpu_device *adev,
+					  unsigned int pasid,
+					  struct amdgpu_task_info *task_info)
 {
 	struct amdgpu_vm *vm;
 	unsigned long flags;
@@ -3094,6 +3175,8 @@ void amdgpu_vm_get_task_info(struct amdgpu_device *adev, unsigned int pasid,
 		*task_info = vm->task_info;
 
 	spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags);
+
+	return vm;
 }
 
 /**
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index 489a162ca620..6a8b833d180e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -348,6 +348,8 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 int amdgpu_vm_make_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm, unsigned int pasid);
 void amdgpu_vm_release_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm);
 void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm);
+void amdgpu_vm_dump_pte(struct amdgpu_device *adev, struct amdgpu_vm *vm,
+			uint64_t addr);
 void amdgpu_vm_get_pd_bo(struct amdgpu_vm *vm,
 			 struct list_head *validated,
 			 struct amdgpu_bo_list_entry *entry);
@@ -401,8 +403,9 @@ bool amdgpu_vm_need_pipeline_sync(struct amdgpu_ring *ring,
 				  struct amdgpu_job *job);
 void amdgpu_vm_check_compute_bug(struct amdgpu_device *adev);
 
-void amdgpu_vm_get_task_info(struct amdgpu_device *adev, unsigned int pasid,
-			     struct amdgpu_task_info *task_info);
+struct amdgpu_vm *amdgpu_vm_get_task_info(struct amdgpu_device *adev,
+					  unsigned int pasid,
+					  struct amdgpu_task_info *task_info);
 
 void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
index 8bf2ba310fd9..18207ecfd85c 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
@@ -1448,9 +1448,10 @@ static int gmc_v8_0_process_interrupt(struct amdgpu_device *adev,
 
 	if (printk_ratelimit()) {
 		struct amdgpu_task_info task_info;
+		struct amdgpu_vm *vm;
 
 		memset(&task_info, 0, sizeof(struct amdgpu_task_info));
-		amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
+		vm = amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
 
 		dev_err(adev->dev, "GPU fault detected: %d 0x%08x for process %s pid %d thread %s pid %d\n",
 			entry->src_id, entry->src_data[0], task_info.process_name,
@@ -1461,6 +1462,9 @@ static int gmc_v8_0_process_interrupt(struct amdgpu_device *adev,
 			status);
 		gmc_v8_0_vm_decode_fault(adev, status, addr, mc_client,
 					 entry->pasid);
+		if (vm)
+			amdgpu_vm_dump_pte(adev, vm, (uint64_t)addr
+					   << AMDGPU_GPU_PAGE_SHIFT);
 	}
 
 	vmid = REG_GET_FIELD(status, VM_CONTEXT1_PROTECTION_FAULT_STATUS,
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index bd5d36944481..f27e88af4016 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -331,9 +331,10 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
 
 	if (printk_ratelimit()) {
 		struct amdgpu_task_info task_info;
+		struct amdgpu_vm *vm;
 
 		memset(&task_info, 0, sizeof(struct amdgpu_task_info));
-		amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
+		vm = amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
 
 		dev_err(adev->dev,
 			"[%s] %s page fault (src_id:%u ring:%u vmid:%u "
@@ -349,6 +350,8 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
 			dev_err(adev->dev,
 				"VM_L2_PROTECTION_FAULT_STATUS:0x%08X\n",
 				status);
+		if (vm)
+			amdgpu_vm_dump_pte(adev, vm, addr);
 	}
 
 	return 0;
-- 
2.17.1