[PATCH 1/3] Revert "drm/amdkfd: Use partial mapping in GPU page fault recovery"

Philip Yang Philip.Yang at amd.com
Mon Oct 23 20:37:01 UTC 2023


This reverts commit c45c3bc930bf60e7658f87c519a40f77513b96aa.

Found a KFDSVMEvict test regression on vega10; kernel BUG backtrace:

[  135.365083] amdgpu: Migration failed during eviction
[  135.365090] ------------[ cut here ]------------
[  135.365097] This was not the last reference
[  135.365122] WARNING: CPU: 5 PID: 1998 at
drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_svm.c:3515
svm_range_evict_svm_bo_worker+0x21c/0x390 [amdgpu]
[  135.365836]  svm_range_evict_svm_bo_worker+0x21c/0x390 [amdgpu]
[  135.366249]  process_one_work+0x298/0x590
[  135.366256]  worker_thread+0x3d/0x3d0
......
[  135.721257] kernel BUG at include/linux/swapops.h:472!
[  135.721537] Call Trace:
[  135.721540]  <TASK>
[  135.721592]  hmm_vma_walk_pmd+0x5c8/0x780
[  135.721598]  walk_pgd_range+0x3bc/0x7c0
[  135.721604]  __walk_page_range+0x1ec/0x200
[  135.721609]  walk_page_range+0x119/0x1a0
[  135.721613]  hmm_range_fault+0x5d/0xb0
[  135.721617]  amdgpu_hmm_range_get_pages+0x159/0x240 [amdgpu]
[  135.721820]  svm_range_validate_and_map+0x57f/0x16c0 [amdgpu]
[  135.722411]  svm_range_restore_pages+0xcd8/0x1150 [amdgpu]
[  135.722613]  amdgpu_vm_handle_fault+0xc2/0x360 [amdgpu]
[  135.722777]  gmc_v9_0_process_interrupt+0x255/0x670 [amdgpu]

Signed-off-by: Philip Yang <Philip.Yang at amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 35 +++++++++-------------------
 1 file changed, 11 insertions(+), 24 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index f2b33fb2afcf..4d000c63cde8 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -1565,7 +1565,6 @@ static void *kfd_svm_page_owner(struct kfd_process *p, int32_t gpuidx)
  * 5. Release page table (and SVM BO) reservation
  */
 static int svm_range_validate_and_map(struct mm_struct *mm,
-				      unsigned long map_start, unsigned long map_last,
 				      struct svm_range *prange, int32_t gpuidx,
 				      bool intr, bool wait, bool flush_tlb)
 {
@@ -1646,8 +1645,6 @@ static int svm_range_validate_and_map(struct mm_struct *mm,
 	end = (prange->last + 1) << PAGE_SHIFT;
 	for (addr = start; !r && addr < end; ) {
 		struct hmm_range *hmm_range;
-		unsigned long map_start_vma;
-		unsigned long map_last_vma;
 		struct vm_area_struct *vma;
 		uint64_t vram_pages_vma;
 		unsigned long next = 0;
@@ -1696,16 +1693,9 @@ static int svm_range_validate_and_map(struct mm_struct *mm,
 			r = -EAGAIN;
 		}
 
-		if (!r) {
-			map_start_vma = max(map_start, prange->start + offset);
-			map_last_vma = min(map_last, prange->start + offset + npages - 1);
-			if (map_start_vma <= map_last_vma) {
-				offset = map_start_vma - prange->start;
-				npages = map_last_vma - map_start_vma + 1;
-				r = svm_range_map_to_gpus(prange, offset, npages, readonly,
-							  ctx->bitmap, wait, flush_tlb);
-			}
-		}
+		if (!r)
+			r = svm_range_map_to_gpus(prange, offset, npages, readonly,
+						  ctx->bitmap, wait, flush_tlb);
 
 		if (!r && next == end)
 			prange->mapped_to_gpu = true;
@@ -1811,8 +1801,8 @@ static void svm_range_restore_work(struct work_struct *work)
 		 */
 		mutex_lock(&prange->migrate_mutex);
 
-		r = svm_range_validate_and_map(mm, prange->start, prange->last, prange,
-					       MAX_GPU_INSTANCE, false, true, false);
+		r = svm_range_validate_and_map(mm, prange, MAX_GPU_INSTANCE,
+					       false, true, false);
 		if (r)
 			pr_debug("failed %d to map 0x%lx to gpus\n", r,
 				 prange->start);
@@ -3026,8 +3016,6 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
 	kfd_smi_event_page_fault_start(node, p->lead_thread->pid, addr,
 				       write_fault, timestamp);
 
-	start = prange->start;
-	last = prange->last;
 	if (prange->actual_loc != 0 || best_loc != 0) {
 		migration = true;
 		/* Align migration range start and size to granularity size */
@@ -3061,11 +3049,10 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
 		}
 	}
 
-	r = svm_range_validate_and_map(mm, start, last, prange, gpuidx, false,
-				       false, false);
+	r = svm_range_validate_and_map(mm, prange, gpuidx, false, false, false);
 	if (r)
 		pr_debug("failed %d to map svms 0x%p [0x%lx 0x%lx] to gpus\n",
-			 r, svms, start, last);
+			 r, svms, prange->start, prange->last);
 
 	kfd_smi_event_page_fault_end(node, p->lead_thread->pid, addr,
 				     migration);
@@ -3611,8 +3598,8 @@ svm_range_set_attr(struct kfd_process *p, struct mm_struct *mm,
 
 		flush_tlb = !migrated && update_mapping && prange->mapped_to_gpu;
 
-		r = svm_range_validate_and_map(mm, prange->start, prange->last, prange,
-					       MAX_GPU_INSTANCE, true, true, flush_tlb);
+		r = svm_range_validate_and_map(mm, prange, MAX_GPU_INSTANCE,
+					       true, true, flush_tlb);
 		if (r)
 			pr_debug("failed %d to map svm range\n", r);
 
@@ -3626,8 +3613,8 @@ svm_range_set_attr(struct kfd_process *p, struct mm_struct *mm,
 		pr_debug("Remapping prange 0x%p [0x%lx 0x%lx]\n",
 			 prange, prange->start, prange->last);
 		mutex_lock(&prange->migrate_mutex);
-		r = svm_range_validate_and_map(mm,  prange->start, prange->last, prange,
-					       MAX_GPU_INSTANCE, true, true, prange->mapped_to_gpu);
+		r = svm_range_validate_and_map(mm, prange, MAX_GPU_INSTANCE,
+					       true, true, prange->mapped_to_gpu);
 		if (r)
 			pr_debug("failed %d on remap svm range\n", r);
 		mutex_unlock(&prange->migrate_mutex);
-- 
2.35.1



More information about the amd-gfx mailing list