[PATCH 3/3] drm/amdgpu: nuke the VM PD/PT shadow handling
Lazar, Lijo
lijo.lazar at amd.com
Wed Aug 28 04:48:53 UTC 2024
On 8/27/2024 7:42 PM, Christian König wrote:
> This was only used as workaround for recovering the page tables after
> VRAM was lost and is no longer necessary after the function
> amdgpu_vm_bo_reset_state_machine() started to do the same.
>
> Compute never used shadows either, so the only proplematic case left is
> SVM and that is most likely not recoverable in any way when VRAM is
> lost.
>
> Signed-off-by: Christian König <christian.koenig at amd.com>
This patch works fine on GC 9.4.3 SOCs.
Acked-by: Lijo Lazar <lijo.lazar at amd.com>
Alex or someone else may take a closer look.
Thanks,
Lijo
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 4 -
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 87 +--------------------
> drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 67 +---------------
> drivers/gpu/drm/amd/amdgpu/amdgpu_object.h | 21 -----
> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 17 ----
> drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 56 +------------
> drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c | 19 +----
> 7 files changed, 6 insertions(+), 265 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index e8c284aea1f2..e2cf77a93a0f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1082,10 +1082,6 @@ struct amdgpu_device {
>
> struct amdgpu_virt virt;
>
> - /* link all shadow bo */
> - struct list_head shadow_list;
> - struct mutex shadow_list_lock;
> -
> /* record hw reset is performed */
> bool has_hw_reset;
> u8 reset_magic[AMDGPU_RESET_MAGIC_NUM];
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index da06705f0026..33a939571f89 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4107,9 +4107,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
> spin_lock_init(&adev->mm_stats.lock);
> spin_lock_init(&adev->wb.lock);
>
> - INIT_LIST_HEAD(&adev->shadow_list);
> - mutex_init(&adev->shadow_list_lock);
> -
> INIT_LIST_HEAD(&adev->reset_list);
>
> INIT_LIST_HEAD(&adev->ras_list);
> @@ -5029,80 +5026,6 @@ static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
> return 0;
> }
>
> -/**
> - * amdgpu_device_recover_vram - Recover some VRAM contents
> - *
> - * @adev: amdgpu_device pointer
> - *
> - * Restores the contents of VRAM buffers from the shadows in GTT. Used to
> - * restore things like GPUVM page tables after a GPU reset where
> - * the contents of VRAM might be lost.
> - *
> - * Returns:
> - * 0 on success, negative error code on failure.
> - */
> -static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
> -{
> - struct dma_fence *fence = NULL, *next = NULL;
> - struct amdgpu_bo *shadow;
> - struct amdgpu_bo_vm *vmbo;
> - long r = 1, tmo;
> -
> - if (amdgpu_sriov_runtime(adev))
> - tmo = msecs_to_jiffies(8000);
> - else
> - tmo = msecs_to_jiffies(100);
> -
> - dev_info(adev->dev, "recover vram bo from shadow start\n");
> - mutex_lock(&adev->shadow_list_lock);
> - list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) {
> - /* If vm is compute context or adev is APU, shadow will be NULL */
> - if (!vmbo->shadow)
> - continue;
> - shadow = vmbo->shadow;
> -
> - /* No need to recover an evicted BO */
> - if (!shadow->tbo.resource ||
> - shadow->tbo.resource->mem_type != TTM_PL_TT ||
> - shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET ||
> - shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM)
> - continue;
> -
> - r = amdgpu_bo_restore_shadow(shadow, &next);
> - if (r)
> - break;
> -
> - if (fence) {
> - tmo = dma_fence_wait_timeout(fence, false, tmo);
> - dma_fence_put(fence);
> - fence = next;
> - if (tmo == 0) {
> - r = -ETIMEDOUT;
> - break;
> - } else if (tmo < 0) {
> - r = tmo;
> - break;
> - }
> - } else {
> - fence = next;
> - }
> - }
> - mutex_unlock(&adev->shadow_list_lock);
> -
> - if (fence)
> - tmo = dma_fence_wait_timeout(fence, false, tmo);
> - dma_fence_put(fence);
> -
> - if (r < 0 || tmo <= 0) {
> - dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
> - return -EIO;
> - }
> -
> - dev_info(adev->dev, "recover vram bo from shadow done\n");
> - return 0;
> -}
> -
> -
> /**
> * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
> *
> @@ -5165,12 +5088,8 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
> if (r)
> return r;
>
> - if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
> + if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST)
> amdgpu_inc_vram_lost(adev);
> - r = amdgpu_device_recover_vram(adev);
> - }
> - if (r)
> - return r;
>
> /* need to be called during full access so we can't do it later like
> * bare-metal does.
> @@ -5569,9 +5488,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
> }
> }
>
> - if (!r)
> - r = amdgpu_device_recover_vram(tmp_adev);
> - else
> + if (r)
> tmp_adev->asic_reset_res = r;
> }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> index ff1a69243699..cc9506fb0cc7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> @@ -76,24 +76,6 @@ static void amdgpu_bo_user_destroy(struct ttm_buffer_object *tbo)
> amdgpu_bo_destroy(tbo);
> }
>
> -static void amdgpu_bo_vm_destroy(struct ttm_buffer_object *tbo)
> -{
> - struct amdgpu_device *adev = amdgpu_ttm_adev(tbo->bdev);
> - struct amdgpu_bo *shadow_bo = ttm_to_amdgpu_bo(tbo), *bo;
> - struct amdgpu_bo_vm *vmbo;
> -
> - bo = shadow_bo->parent;
> - vmbo = to_amdgpu_bo_vm(bo);
> - /* in case amdgpu_device_recover_vram got NULL of bo->parent */
> - if (!list_empty(&vmbo->shadow_list)) {
> - mutex_lock(&adev->shadow_list_lock);
> - list_del_init(&vmbo->shadow_list);
> - mutex_unlock(&adev->shadow_list_lock);
> - }
> -
> - amdgpu_bo_destroy(tbo);
> -}
> -
> /**
> * amdgpu_bo_is_amdgpu_bo - check if the buffer object is an &amdgpu_bo
> * @bo: buffer object to be checked
> @@ -107,8 +89,7 @@ static void amdgpu_bo_vm_destroy(struct ttm_buffer_object *tbo)
> bool amdgpu_bo_is_amdgpu_bo(struct ttm_buffer_object *bo)
> {
> if (bo->destroy == &amdgpu_bo_destroy ||
> - bo->destroy == &amdgpu_bo_user_destroy ||
> - bo->destroy == &amdgpu_bo_vm_destroy)
> + bo->destroy == &amdgpu_bo_user_destroy)
> return true;
>
> return false;
> @@ -718,52 +699,6 @@ int amdgpu_bo_create_vm(struct amdgpu_device *adev,
> return r;
> }
>
> -/**
> - * amdgpu_bo_add_to_shadow_list - add a BO to the shadow list
> - *
> - * @vmbo: BO that will be inserted into the shadow list
> - *
> - * Insert a BO to the shadow list.
> - */
> -void amdgpu_bo_add_to_shadow_list(struct amdgpu_bo_vm *vmbo)
> -{
> - struct amdgpu_device *adev = amdgpu_ttm_adev(vmbo->bo.tbo.bdev);
> -
> - mutex_lock(&adev->shadow_list_lock);
> - list_add_tail(&vmbo->shadow_list, &adev->shadow_list);
> - vmbo->shadow->parent = amdgpu_bo_ref(&vmbo->bo);
> - vmbo->shadow->tbo.destroy = &amdgpu_bo_vm_destroy;
> - mutex_unlock(&adev->shadow_list_lock);
> -}
> -
> -/**
> - * amdgpu_bo_restore_shadow - restore an &amdgpu_bo shadow
> - *
> - * @shadow: &amdgpu_bo shadow to be restored
> - * @fence: dma_fence associated with the operation
> - *
> - * Copies a buffer object's shadow content back to the object.
> - * This is used for recovering a buffer from its shadow in case of a gpu
> - * reset where vram context may be lost.
> - *
> - * Returns:
> - * 0 for success or a negative error code on failure.
> - */
> -int amdgpu_bo_restore_shadow(struct amdgpu_bo *shadow, struct dma_fence **fence)
> -
> -{
> - struct amdgpu_device *adev = amdgpu_ttm_adev(shadow->tbo.bdev);
> - struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;
> - uint64_t shadow_addr, parent_addr;
> -
> - shadow_addr = amdgpu_bo_gpu_offset(shadow);
> - parent_addr = amdgpu_bo_gpu_offset(shadow->parent);
> -
> - return amdgpu_copy_buffer(ring, shadow_addr, parent_addr,
> - amdgpu_bo_size(shadow), NULL, fence,
> - true, false, 0);
> -}
> -
> /**
> * amdgpu_bo_kmap - map an &amdgpu_bo buffer object
> * @bo: &amdgpu_bo buffer object to be mapped
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> index 44ffd9a03dce..717e47b46167 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> @@ -136,8 +136,6 @@ struct amdgpu_bo_user {
>
> struct amdgpu_bo_vm {
> struct amdgpu_bo bo;
> - struct amdgpu_bo *shadow;
> - struct list_head shadow_list;
> struct amdgpu_vm_bo_base entries[];
> };
>
> @@ -275,22 +273,6 @@ static inline bool amdgpu_bo_encrypted(struct amdgpu_bo *bo)
> return bo->flags & AMDGPU_GEM_CREATE_ENCRYPTED;
> }
>
> -/**
> - * amdgpu_bo_shadowed - check if the BO is shadowed
> - *
> - * @bo: BO to be tested.
> - *
> - * Returns:
> - * NULL if not shadowed or else return a BO pointer.
> - */
> -static inline struct amdgpu_bo *amdgpu_bo_shadowed(struct amdgpu_bo *bo)
> -{
> - if (bo->tbo.type == ttm_bo_type_kernel)
> - return to_amdgpu_bo_vm(bo)->shadow;
> -
> - return NULL;
> -}
> -
> bool amdgpu_bo_is_amdgpu_bo(struct ttm_buffer_object *bo);
> void amdgpu_bo_placement_from_domain(struct amdgpu_bo *abo, u32 domain);
>
> @@ -347,9 +329,6 @@ u64 amdgpu_bo_gpu_offset(struct amdgpu_bo *bo);
> u64 amdgpu_bo_gpu_offset_no_check(struct amdgpu_bo *bo);
> void amdgpu_bo_get_memory(struct amdgpu_bo *bo,
> struct amdgpu_mem_stats *stats);
> -void amdgpu_bo_add_to_shadow_list(struct amdgpu_bo_vm *vmbo);
> -int amdgpu_bo_restore_shadow(struct amdgpu_bo *shadow,
> - struct dma_fence **fence);
> uint32_t amdgpu_bo_get_preferred_domain(struct amdgpu_device *adev,
> uint32_t domain);
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index ad2e469548c9..3464a7a880f0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -465,7 +465,6 @@ int amdgpu_vm_validate(struct amdgpu_device *adev, struct amdgpu_vm *vm,
> {
> uint64_t new_vm_generation = amdgpu_vm_generation(adev, vm);
> struct amdgpu_vm_bo_base *bo_base;
> - struct amdgpu_bo *shadow;
> struct amdgpu_bo *bo;
> int r;
>
> @@ -486,16 +485,10 @@ int amdgpu_vm_validate(struct amdgpu_device *adev, struct amdgpu_vm *vm,
> spin_unlock(&vm->status_lock);
>
> bo = bo_base->bo;
> - shadow = amdgpu_bo_shadowed(bo);
>
> r = validate(param, bo);
> if (r)
> return r;
> - if (shadow) {
> - r = validate(param, shadow);
> - if (r)
> - return r;
> - }
>
> if (bo->tbo.type != ttm_bo_type_kernel) {
> amdgpu_vm_bo_moved(bo_base);
> @@ -2129,10 +2122,6 @@ void amdgpu_vm_bo_invalidate(struct amdgpu_device *adev,
> {
> struct amdgpu_vm_bo_base *bo_base;
>
> - /* shadow bo doesn't have bo base, its validation needs its parent */
> - if (bo->parent && (amdgpu_bo_shadowed(bo->parent) == bo))
> - bo = bo->parent;
> -
> for (bo_base = bo->vm_bo; bo_base; bo_base = bo_base->next) {
> struct amdgpu_vm *vm = bo_base->vm;
>
> @@ -2466,7 +2455,6 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
> root_bo = amdgpu_bo_ref(&root->bo);
> r = amdgpu_bo_reserve(root_bo, true);
> if (r) {
> - amdgpu_bo_unref(&root->shadow);
> amdgpu_bo_unref(&root_bo);
> goto error_free_delayed;
> }
> @@ -2558,11 +2546,6 @@ int amdgpu_vm_make_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm)
> vm->last_update = dma_fence_get_stub();
> vm->is_compute_context = true;
>
> - /* Free the shadow bo for compute VM */
> - amdgpu_bo_unref(&to_amdgpu_bo_vm(vm->root.bo)->shadow);
> -
> - goto unreserve_bo;
> -
> unreserve_bo:
> amdgpu_bo_unreserve(vm->root.bo);
> return r;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
> index e39d6e7643bf..c8e0b8cfd336 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
> @@ -383,14 +383,6 @@ int amdgpu_vm_pt_clear(struct amdgpu_device *adev, struct amdgpu_vm *vm,
> if (r)
> return r;
>
> - if (vmbo->shadow) {
> - struct amdgpu_bo *shadow = vmbo->shadow;
> -
> - r = ttm_bo_validate(&shadow->tbo, &shadow->placement, &ctx);
> - if (r)
> - return r;
> - }
> -
> if (!drm_dev_enter(adev_to_drm(adev), &idx))
> return -ENODEV;
>
> @@ -448,10 +440,7 @@ int amdgpu_vm_pt_create(struct amdgpu_device *adev, struct amdgpu_vm *vm,
> int32_t xcp_id)
> {
> struct amdgpu_bo_param bp;
> - struct amdgpu_bo *bo;
> - struct dma_resv *resv;
> unsigned int num_entries;
> - int r;
>
> memset(&bp, 0, sizeof(bp));
>
> @@ -484,42 +473,7 @@ int amdgpu_vm_pt_create(struct amdgpu_device *adev, struct amdgpu_vm *vm,
> if (vm->root.bo)
> bp.resv = vm->root.bo->tbo.base.resv;
>
> - r = amdgpu_bo_create_vm(adev, &bp, vmbo);
> - if (r)
> - return r;
> -
> - bo = &(*vmbo)->bo;
> - if (vm->is_compute_context || (adev->flags & AMD_IS_APU)) {
> - (*vmbo)->shadow = NULL;
> - return 0;
> - }
> -
> - if (!bp.resv)
> - WARN_ON(dma_resv_lock(bo->tbo.base.resv,
> - NULL));
> - resv = bp.resv;
> - memset(&bp, 0, sizeof(bp));
> - bp.size = amdgpu_vm_pt_size(adev, level);
> - bp.domain = AMDGPU_GEM_DOMAIN_GTT;
> - bp.flags = AMDGPU_GEM_CREATE_CPU_GTT_USWC;
> - bp.type = ttm_bo_type_kernel;
> - bp.resv = bo->tbo.base.resv;
> - bp.bo_ptr_size = sizeof(struct amdgpu_bo);
> - bp.xcp_id_plus1 = xcp_id + 1;
> -
> - r = amdgpu_bo_create(adev, &bp, &(*vmbo)->shadow);
> -
> - if (!resv)
> - dma_resv_unlock(bo->tbo.base.resv);
> -
> - if (r) {
> - amdgpu_bo_unref(&bo);
> - return r;
> - }
> -
> - amdgpu_bo_add_to_shadow_list(*vmbo);
> -
> - return 0;
> + return amdgpu_bo_create_vm(adev, &bp, vmbo);
> }
>
> /**
> @@ -569,7 +523,6 @@ static int amdgpu_vm_pt_alloc(struct amdgpu_device *adev,
> return 0;
>
> error_free_pt:
> - amdgpu_bo_unref(&pt->shadow);
> amdgpu_bo_unref(&pt_bo);
> return r;
> }
> @@ -581,17 +534,10 @@ static int amdgpu_vm_pt_alloc(struct amdgpu_device *adev,
> */
> static void amdgpu_vm_pt_free(struct amdgpu_vm_bo_base *entry)
> {
> - struct amdgpu_bo *shadow;
> -
> if (!entry->bo)
> return;
>
> entry->bo->vm_bo = NULL;
> - shadow = amdgpu_bo_shadowed(entry->bo);
> - if (shadow) {
> - ttm_bo_set_bulk_move(&shadow->tbo, NULL);
> - amdgpu_bo_unref(&shadow);
> - }
> ttm_bo_set_bulk_move(&entry->bo->tbo, NULL);
>
> spin_lock(&entry->vm->status_lock);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
> index 9b748d7058b5..390432a22ddd 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
> @@ -35,16 +35,7 @@
> */
> static int amdgpu_vm_sdma_map_table(struct amdgpu_bo_vm *table)
> {
> - int r;
> -
> - r = amdgpu_ttm_alloc_gart(&table->bo.tbo);
> - if (r)
> - return r;
> -
> - if (table->shadow)
> - r = amdgpu_ttm_alloc_gart(&table->shadow->tbo);
> -
> - return r;
> + return amdgpu_ttm_alloc_gart(&table->bo.tbo);
> }
>
> /* Allocate a new job for @count PTE updates */
> @@ -273,17 +264,13 @@ static int amdgpu_vm_sdma_update(struct amdgpu_vm_update_params *p,
>
> if (!p->pages_addr) {
> /* set page commands needed */
> - if (vmbo->shadow)
> - amdgpu_vm_sdma_set_ptes(p, vmbo->shadow, pe, addr,
> - count, incr, flags);
> amdgpu_vm_sdma_set_ptes(p, bo, pe, addr, count,
> incr, flags);
> return 0;
> }
>
> /* copy commands needed */
> - ndw -= p->adev->vm_manager.vm_pte_funcs->copy_pte_num_dw *
> - (vmbo->shadow ? 2 : 1);
> + ndw -= p->adev->vm_manager.vm_pte_funcs->copy_pte_num_dw;
>
> /* for padding */
> ndw -= 7;
> @@ -298,8 +285,6 @@ static int amdgpu_vm_sdma_update(struct amdgpu_vm_update_params *p,
> pte[i] |= flags;
> }
>
> - if (vmbo->shadow)
> - amdgpu_vm_sdma_copy_ptes(p, vmbo->shadow, pe, nptes);
> amdgpu_vm_sdma_copy_ptes(p, bo, pe, nptes);
>
> pe += nptes * 8;
More information about the amd-gfx
mailing list