[PATCH] drm/amdkfd: Fix some issues in the userptr buffer validation process.
Xiaogang.Chen
xiaogang.chen at amd.com
Wed Apr 12 06:14:32 UTC 2023
From: Xiaogang Chen <xiaogang.chen at amd.com>
The userptr buffer restore process has the following issues:
1: amdgpu_ttm_tt_get_user_pages can fail (-EFAULT). If it fails, we should not
mark the mem as valid (mem->invalid = 0), because in this case mem has no
associated hmm range or user_pages.
2: An mmu notifier can run concurrently and update mem->range->notifier->invalidate_seq,
but not mem->range->notifier_seq. That leaves mem->range->notifier_seq stale
when mem is in process_info->userptr_inval_list and amdgpu_amdkfd_restore_userptr_worker
gets interrupted. At the next rescheduled attempt we then compare the stale
mem->range->notifier_seq with mem->range->notifier->invalidate_seq.
Signed-off-by: Xiaogang Chen <Xiaogang.Chen at amd.com>
---
.../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 45 +++++++++++++++----
1 file changed, 37 insertions(+), 8 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 7b1f5933ebaa..6881f1b0844c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -2444,7 +2444,9 @@ static int update_invalid_user_pages(struct amdkfd_process_info *process_info,
ret = -EAGAIN;
goto unlock_out;
}
- mem->invalid = 0;
+ /* set mem valid if mem has hmm range associated */
+ if (mem->range)
+ mem->invalid = 0;
}
unlock_out:
@@ -2576,16 +2578,28 @@ static int confirm_valid_user_pages_locked(struct amdkfd_process_info *process_i
list_for_each_entry_safe(mem, tmp_mem,
&process_info->userptr_inval_list,
validate_list.head) {
- bool valid = amdgpu_ttm_tt_get_user_pages_done(
- mem->bo->tbo.ttm, mem->range);
+ /* Only check mem with hmm range associated */
+ bool valid;
- mem->range = NULL;
- if (!valid) {
- WARN(!mem->invalid, "Invalid BO not marked invalid");
+ if (mem->range) {
+ valid = amdgpu_ttm_tt_get_user_pages_done(
+ mem->bo->tbo.ttm, mem->range);
+
+ mem->range = NULL;
+ if (!valid) {
+ WARN(!mem->invalid, "Invalid BO not marked invalid");
+ ret = -EAGAIN;
+ continue;
+ }
+ } else
+ /* keep mem without hmm range at userptr_inval_list */
+ continue;
+
+ if (mem->invalid) {
+ WARN(1, "Valid BO is marked invalid");
ret = -EAGAIN;
continue;
}
- WARN(mem->invalid, "Valid BO is marked invalid");
list_move_tail(&mem->validate_list.head,
&process_info->userptr_valid_list);
@@ -2644,8 +2658,23 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work)
* reference counting inside KFD will handle this case.
*/
mutex_lock(&process_info->notifier_lock);
- if (process_info->evicted_bos != evicted_bos)
+ if (process_info->evicted_bos != evicted_bos) {
+ /* mmu notifier interrupted amdgpu_amdkfd_restore_userptr_worker
+ * before reschedule next attempt update stale mem->range->notifier_seq
+ * inside userptr_inval_list
+ */
+ struct kgd_mem *mem, *tmp_mem;
+
+ list_for_each_entry_safe(mem, tmp_mem,
+ &process_info->userptr_inval_list,
+ validate_list.head) {
+
+ if (mem->range)
+ mem->range->notifier_seq = mem->range->notifier->invalidate_seq;
+ }
+
goto unlock_notifier_out;
+ }
if (confirm_valid_user_pages_locked(process_info)) {
WARN(1, "User pages unexpectedly invalid");
--
2.25.1
More information about the amd-gfx
mailing list