[PATCH 7/8] drm/amdgpu: reset VM when an error is detected
Christian König
ckoenig.leichtzumerken at gmail.com
Thu Apr 20 11:57:51 UTC 2023
When some problem with the updates of page tables is detected reset the
state machine of the VM and re-create all page tables from scratch.
Signed-off-by: Christian König <christian.koenig at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 81 +++++++++++++++++++++-----
1 file changed, 65 insertions(+), 16 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index fa8f48a0fa84..8082c9e006b1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -265,6 +265,32 @@ static void amdgpu_vm_bo_done(struct amdgpu_vm_bo_base *vm_bo)
spin_unlock(&vm_bo->vm->status_lock);
}
+/**
+ * amdgpu_vm_bo_reset_state_machine - reset the vm_bo state machine
+ * @vm: the VM which state machine to reset
+ *
+ * Move all vm_bo object in the VM into a state where they will be updated
+ * again during validation.
+ */
+static void amdgpu_vm_bo_reset_state_machine(struct amdgpu_vm *vm)
+{
+ struct amdgpu_vm_bo_base *vm_bo, *tmp;
+
+ spin_lock(&vm->status_lock);
+ list_splice_init(&vm->done, &vm->invalidated);
+ list_for_each_entry(vm_bo, &vm->invalidated, vm_status)
+ vm_bo->moved = true;
+ list_for_each_entry_safe(vm_bo, tmp, &vm->idle, vm_status) {
+ struct amdgpu_bo *bo = vm_bo->bo;
+
+ if (!bo || bo->tbo.type != ttm_bo_type_kernel)
+ list_move(&vm_bo->vm_status, &vm_bo->vm->moved);
+ else if (bo->parent)
+ list_move(&vm_bo->vm_status, &vm_bo->vm->relocated);
+ }
+ spin_unlock(&vm->status_lock);
+}
+
/**
* amdgpu_vm_bo_base_init - Adds bo to the list of bos associated with the vm
*
@@ -350,6 +376,34 @@ void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev,
spin_unlock(&adev->mman.bdev.lru_lock);
}
+/* Create scheduler entities for page table updates */
+static int amdgpu_vm_init_entities(struct amdgpu_device *adev,
+ struct amdgpu_vm *vm)
+{
+ int r;
+
+ r = drm_sched_entity_init(&vm->immediate, DRM_SCHED_PRIORITY_NORMAL,
+ adev->vm_manager.vm_pte_scheds,
+ adev->vm_manager.vm_pte_num_scheds, NULL);
+ if (r)
+ goto error;
+
+ return drm_sched_entity_init(&vm->delayed, DRM_SCHED_PRIORITY_NORMAL,
+ adev->vm_manager.vm_pte_scheds,
+ adev->vm_manager.vm_pte_num_scheds, NULL);
+
+error:
+ drm_sched_entity_destroy(&vm->immediate);
+ return r;
+}
+
+/* Destroy the entities for page table updates again */
+static void amdgpu_vm_fini_entities(struct amdgpu_vm *vm)
+{
+ drm_sched_entity_destroy(&vm->immediate);
+ drm_sched_entity_destroy(&vm->delayed);
+}
+
/**
* amdgpu_vm_validate_pt_bos - validate the page table BOs
*
@@ -372,6 +426,14 @@ int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm,
struct amdgpu_bo *bo;
int r;
+ if (drm_sched_entity_error(&vm->delayed)) {
+ amdgpu_vm_bo_reset_state_machine(vm);
+ amdgpu_vm_fini_entities(vm);
+ r = amdgpu_vm_init_entities(adev, vm);
+ if (r)
+ return r;
+ }
+
spin_lock(&vm->status_lock);
while (!list_empty(&vm->evicted)) {
bo_base = list_first_entry(&vm->evicted,
@@ -2037,19 +2099,10 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm)
INIT_LIST_HEAD(&vm->pt_freed);
INIT_WORK(&vm->pt_free_work, amdgpu_vm_pt_free_work);
- /* create scheduler entities for page table updates */
- r = drm_sched_entity_init(&vm->immediate, DRM_SCHED_PRIORITY_NORMAL,
- adev->vm_manager.vm_pte_scheds,
- adev->vm_manager.vm_pte_num_scheds, NULL);
+ r = amdgpu_vm_init_entities(adev, vm);
if (r)
return r;
- r = drm_sched_entity_init(&vm->delayed, DRM_SCHED_PRIORITY_NORMAL,
- adev->vm_manager.vm_pte_scheds,
- adev->vm_manager.vm_pte_num_scheds, NULL);
- if (r)
- goto error_free_immediate;
-
vm->pte_support_ats = false;
vm->is_compute_context = false;
@@ -2109,10 +2162,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm)
error_free_delayed:
dma_fence_put(vm->last_tlb_flush);
dma_fence_put(vm->last_unlocked);
- drm_sched_entity_destroy(&vm->delayed);
-
-error_free_immediate:
- drm_sched_entity_destroy(&vm->immediate);
+ amdgpu_vm_fini_entities(vm);
return r;
}
@@ -2265,8 +2315,7 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
amdgpu_bo_unref(&root);
WARN_ON(vm->root.bo);
- drm_sched_entity_destroy(&vm->immediate);
- drm_sched_entity_destroy(&vm->delayed);
+ amdgpu_vm_fini_entities(vm);
if (!RB_EMPTY_ROOT(&vm->va.rb_root)) {
dev_err(adev->dev, "still active bo inside vm\n");
--
2.34.1
More information about the amd-gfx
mailing list