[PATCH 3/3] drm/amdgpu: nuke the VM PD/PT shadow handling

Lazar, Lijo lijo.lazar at amd.com
Wed Aug 28 04:48:53 UTC 2024



On 8/27/2024 7:42 PM, Christian König wrote:
> This was only used as a workaround for recovering the page tables after
> VRAM was lost and is no longer necessary after the function
> amdgpu_vm_bo_reset_state_machine() started to do the same.
> 
> Compute never used shadows either, so the only problematic case left is
> SVM and that is most likely not recoverable in any way when VRAM is
> lost.
> 
> Signed-off-by: Christian König <christian.koenig at amd.com>

This patch works fine on GC 9.4.3 SOCs.

Acked-by: Lijo Lazar <lijo.lazar at amd.com>

Alex or someone else may take a closer look.

Thanks,
Lijo

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  4 -
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  | 87 +--------------------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_object.c  | 67 +---------------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_object.h  | 21 -----
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c      | 17 ----
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c   | 56 +------------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c | 19 +----
>  7 files changed, 6 insertions(+), 265 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index e8c284aea1f2..e2cf77a93a0f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1082,10 +1082,6 @@ struct amdgpu_device {
>  
>  	struct amdgpu_virt	virt;
>  
> -	/* link all shadow bo */
> -	struct list_head                shadow_list;
> -	struct mutex                    shadow_list_lock;
> -
>  	/* record hw reset is performed */
>  	bool has_hw_reset;
>  	u8				reset_magic[AMDGPU_RESET_MAGIC_NUM];
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index da06705f0026..33a939571f89 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4107,9 +4107,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>  	spin_lock_init(&adev->mm_stats.lock);
>  	spin_lock_init(&adev->wb.lock);
>  
> -	INIT_LIST_HEAD(&adev->shadow_list);
> -	mutex_init(&adev->shadow_list_lock);
> -
>  	INIT_LIST_HEAD(&adev->reset_list);
>  
>  	INIT_LIST_HEAD(&adev->ras_list);
> @@ -5029,80 +5026,6 @@ static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
>  	return 0;
>  }
>  
> -/**
> - * amdgpu_device_recover_vram - Recover some VRAM contents
> - *
> - * @adev: amdgpu_device pointer
> - *
> - * Restores the contents of VRAM buffers from the shadows in GTT.  Used to
> - * restore things like GPUVM page tables after a GPU reset where
> - * the contents of VRAM might be lost.
> - *
> - * Returns:
> - * 0 on success, negative error code on failure.
> - */
> -static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
> -{
> -	struct dma_fence *fence = NULL, *next = NULL;
> -	struct amdgpu_bo *shadow;
> -	struct amdgpu_bo_vm *vmbo;
> -	long r = 1, tmo;
> -
> -	if (amdgpu_sriov_runtime(adev))
> -		tmo = msecs_to_jiffies(8000);
> -	else
> -		tmo = msecs_to_jiffies(100);
> -
> -	dev_info(adev->dev, "recover vram bo from shadow start\n");
> -	mutex_lock(&adev->shadow_list_lock);
> -	list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) {
> -		/* If vm is compute context or adev is APU, shadow will be NULL */
> -		if (!vmbo->shadow)
> -			continue;
> -		shadow = vmbo->shadow;
> -
> -		/* No need to recover an evicted BO */
> -		if (!shadow->tbo.resource ||
> -		    shadow->tbo.resource->mem_type != TTM_PL_TT ||
> -		    shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET ||
> -		    shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM)
> -			continue;
> -
> -		r = amdgpu_bo_restore_shadow(shadow, &next);
> -		if (r)
> -			break;
> -
> -		if (fence) {
> -			tmo = dma_fence_wait_timeout(fence, false, tmo);
> -			dma_fence_put(fence);
> -			fence = next;
> -			if (tmo == 0) {
> -				r = -ETIMEDOUT;
> -				break;
> -			} else if (tmo < 0) {
> -				r = tmo;
> -				break;
> -			}
> -		} else {
> -			fence = next;
> -		}
> -	}
> -	mutex_unlock(&adev->shadow_list_lock);
> -
> -	if (fence)
> -		tmo = dma_fence_wait_timeout(fence, false, tmo);
> -	dma_fence_put(fence);
> -
> -	if (r < 0 || tmo <= 0) {
> -		dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
> -		return -EIO;
> -	}
> -
> -	dev_info(adev->dev, "recover vram bo from shadow done\n");
> -	return 0;
> -}
> -
> -
>  /**
>   * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
>   *
> @@ -5165,12 +5088,8 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
>  	if (r)
>  		return r;
>  
> -	if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
> +	if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST)
>  		amdgpu_inc_vram_lost(adev);
> -		r = amdgpu_device_recover_vram(adev);
> -	}
> -	if (r)
> -		return r;
>  
>  	/* need to be called during full access so we can't do it later like
>  	 * bare-metal does.
> @@ -5569,9 +5488,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
>  			}
>  		}
>  
> -		if (!r)
> -			r = amdgpu_device_recover_vram(tmp_adev);
> -		else
> +		if (r)
>  			tmp_adev->asic_reset_res = r;
>  	}
>  
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> index ff1a69243699..cc9506fb0cc7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> @@ -76,24 +76,6 @@ static void amdgpu_bo_user_destroy(struct ttm_buffer_object *tbo)
>  	amdgpu_bo_destroy(tbo);
>  }
>  
> -static void amdgpu_bo_vm_destroy(struct ttm_buffer_object *tbo)
> -{
> -	struct amdgpu_device *adev = amdgpu_ttm_adev(tbo->bdev);
> -	struct amdgpu_bo *shadow_bo = ttm_to_amdgpu_bo(tbo), *bo;
> -	struct amdgpu_bo_vm *vmbo;
> -
> -	bo = shadow_bo->parent;
> -	vmbo = to_amdgpu_bo_vm(bo);
> -	/* in case amdgpu_device_recover_vram got NULL of bo->parent */
> -	if (!list_empty(&vmbo->shadow_list)) {
> -		mutex_lock(&adev->shadow_list_lock);
> -		list_del_init(&vmbo->shadow_list);
> -		mutex_unlock(&adev->shadow_list_lock);
> -	}
> -
> -	amdgpu_bo_destroy(tbo);
> -}
> -
>  /**
>   * amdgpu_bo_is_amdgpu_bo - check if the buffer object is an &amdgpu_bo
>   * @bo: buffer object to be checked
> @@ -107,8 +89,7 @@ static void amdgpu_bo_vm_destroy(struct ttm_buffer_object *tbo)
>  bool amdgpu_bo_is_amdgpu_bo(struct ttm_buffer_object *bo)
>  {
>  	if (bo->destroy == &amdgpu_bo_destroy ||
> -	    bo->destroy == &amdgpu_bo_user_destroy ||
> -	    bo->destroy == &amdgpu_bo_vm_destroy)
> +	    bo->destroy == &amdgpu_bo_user_destroy)
>  		return true;
>  
>  	return false;
> @@ -718,52 +699,6 @@ int amdgpu_bo_create_vm(struct amdgpu_device *adev,
>  	return r;
>  }
>  
> -/**
> - * amdgpu_bo_add_to_shadow_list - add a BO to the shadow list
> - *
> - * @vmbo: BO that will be inserted into the shadow list
> - *
> - * Insert a BO to the shadow list.
> - */
> -void amdgpu_bo_add_to_shadow_list(struct amdgpu_bo_vm *vmbo)
> -{
> -	struct amdgpu_device *adev = amdgpu_ttm_adev(vmbo->bo.tbo.bdev);
> -
> -	mutex_lock(&adev->shadow_list_lock);
> -	list_add_tail(&vmbo->shadow_list, &adev->shadow_list);
> -	vmbo->shadow->parent = amdgpu_bo_ref(&vmbo->bo);
> -	vmbo->shadow->tbo.destroy = &amdgpu_bo_vm_destroy;
> -	mutex_unlock(&adev->shadow_list_lock);
> -}
> -
> -/**
> - * amdgpu_bo_restore_shadow - restore an &amdgpu_bo shadow
> - *
> - * @shadow: &amdgpu_bo shadow to be restored
> - * @fence: dma_fence associated with the operation
> - *
> - * Copies a buffer object's shadow content back to the object.
> - * This is used for recovering a buffer from its shadow in case of a gpu
> - * reset where vram context may be lost.
> - *
> - * Returns:
> - * 0 for success or a negative error code on failure.
> - */
> -int amdgpu_bo_restore_shadow(struct amdgpu_bo *shadow, struct dma_fence **fence)
> -
> -{
> -	struct amdgpu_device *adev = amdgpu_ttm_adev(shadow->tbo.bdev);
> -	struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;
> -	uint64_t shadow_addr, parent_addr;
> -
> -	shadow_addr = amdgpu_bo_gpu_offset(shadow);
> -	parent_addr = amdgpu_bo_gpu_offset(shadow->parent);
> -
> -	return amdgpu_copy_buffer(ring, shadow_addr, parent_addr,
> -				  amdgpu_bo_size(shadow), NULL, fence,
> -				  true, false, 0);
> -}
> -
>  /**
>   * amdgpu_bo_kmap - map an &amdgpu_bo buffer object
>   * @bo: &amdgpu_bo buffer object to be mapped
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> index 44ffd9a03dce..717e47b46167 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> @@ -136,8 +136,6 @@ struct amdgpu_bo_user {
>  
>  struct amdgpu_bo_vm {
>  	struct amdgpu_bo		bo;
> -	struct amdgpu_bo		*shadow;
> -	struct list_head		shadow_list;
>  	struct amdgpu_vm_bo_base        entries[];
>  };
>  
> @@ -275,22 +273,6 @@ static inline bool amdgpu_bo_encrypted(struct amdgpu_bo *bo)
>  	return bo->flags & AMDGPU_GEM_CREATE_ENCRYPTED;
>  }
>  
> -/**
> - * amdgpu_bo_shadowed - check if the BO is shadowed
> - *
> - * @bo: BO to be tested.
> - *
> - * Returns:
> - * NULL if not shadowed or else return a BO pointer.
> - */
> -static inline struct amdgpu_bo *amdgpu_bo_shadowed(struct amdgpu_bo *bo)
> -{
> -	if (bo->tbo.type == ttm_bo_type_kernel)
> -		return to_amdgpu_bo_vm(bo)->shadow;
> -
> -	return NULL;
> -}
> -
>  bool amdgpu_bo_is_amdgpu_bo(struct ttm_buffer_object *bo);
>  void amdgpu_bo_placement_from_domain(struct amdgpu_bo *abo, u32 domain);
>  
> @@ -347,9 +329,6 @@ u64 amdgpu_bo_gpu_offset(struct amdgpu_bo *bo);
>  u64 amdgpu_bo_gpu_offset_no_check(struct amdgpu_bo *bo);
>  void amdgpu_bo_get_memory(struct amdgpu_bo *bo,
>  			  struct amdgpu_mem_stats *stats);
> -void amdgpu_bo_add_to_shadow_list(struct amdgpu_bo_vm *vmbo);
> -int amdgpu_bo_restore_shadow(struct amdgpu_bo *shadow,
> -			     struct dma_fence **fence);
>  uint32_t amdgpu_bo_get_preferred_domain(struct amdgpu_device *adev,
>  					    uint32_t domain);
>  
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index ad2e469548c9..3464a7a880f0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -465,7 +465,6 @@ int amdgpu_vm_validate(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>  {
>  	uint64_t new_vm_generation = amdgpu_vm_generation(adev, vm);
>  	struct amdgpu_vm_bo_base *bo_base;
> -	struct amdgpu_bo *shadow;
>  	struct amdgpu_bo *bo;
>  	int r;
>  
> @@ -486,16 +485,10 @@ int amdgpu_vm_validate(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>  		spin_unlock(&vm->status_lock);
>  
>  		bo = bo_base->bo;
> -		shadow = amdgpu_bo_shadowed(bo);
>  
>  		r = validate(param, bo);
>  		if (r)
>  			return r;
> -		if (shadow) {
> -			r = validate(param, shadow);
> -			if (r)
> -				return r;
> -		}
>  
>  		if (bo->tbo.type != ttm_bo_type_kernel) {
>  			amdgpu_vm_bo_moved(bo_base);
> @@ -2129,10 +2122,6 @@ void amdgpu_vm_bo_invalidate(struct amdgpu_device *adev,
>  {
>  	struct amdgpu_vm_bo_base *bo_base;
>  
> -	/* shadow bo doesn't have bo base, its validation needs its parent */
> -	if (bo->parent && (amdgpu_bo_shadowed(bo->parent) == bo))
> -		bo = bo->parent;
> -
>  	for (bo_base = bo->vm_bo; bo_base; bo_base = bo_base->next) {
>  		struct amdgpu_vm *vm = bo_base->vm;
>  
> @@ -2466,7 +2455,6 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>  	root_bo = amdgpu_bo_ref(&root->bo);
>  	r = amdgpu_bo_reserve(root_bo, true);
>  	if (r) {
> -		amdgpu_bo_unref(&root->shadow);
>  		amdgpu_bo_unref(&root_bo);
>  		goto error_free_delayed;
>  	}
> @@ -2558,11 +2546,6 @@ int amdgpu_vm_make_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm)
>  	vm->last_update = dma_fence_get_stub();
>  	vm->is_compute_context = true;
>  
> -	/* Free the shadow bo for compute VM */
> -	amdgpu_bo_unref(&to_amdgpu_bo_vm(vm->root.bo)->shadow);
> -
> -	goto unreserve_bo;
> -
>  unreserve_bo:
>  	amdgpu_bo_unreserve(vm->root.bo);
>  	return r;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
> index e39d6e7643bf..c8e0b8cfd336 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
> @@ -383,14 +383,6 @@ int amdgpu_vm_pt_clear(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>  	if (r)
>  		return r;
>  
> -	if (vmbo->shadow) {
> -		struct amdgpu_bo *shadow = vmbo->shadow;
> -
> -		r = ttm_bo_validate(&shadow->tbo, &shadow->placement, &ctx);
> -		if (r)
> -			return r;
> -	}
> -
>  	if (!drm_dev_enter(adev_to_drm(adev), &idx))
>  		return -ENODEV;
>  
> @@ -448,10 +440,7 @@ int amdgpu_vm_pt_create(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>  			int32_t xcp_id)
>  {
>  	struct amdgpu_bo_param bp;
> -	struct amdgpu_bo *bo;
> -	struct dma_resv *resv;
>  	unsigned int num_entries;
> -	int r;
>  
>  	memset(&bp, 0, sizeof(bp));
>  
> @@ -484,42 +473,7 @@ int amdgpu_vm_pt_create(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>  	if (vm->root.bo)
>  		bp.resv = vm->root.bo->tbo.base.resv;
>  
> -	r = amdgpu_bo_create_vm(adev, &bp, vmbo);
> -	if (r)
> -		return r;
> -
> -	bo = &(*vmbo)->bo;
> -	if (vm->is_compute_context || (adev->flags & AMD_IS_APU)) {
> -		(*vmbo)->shadow = NULL;
> -		return 0;
> -	}
> -
> -	if (!bp.resv)
> -		WARN_ON(dma_resv_lock(bo->tbo.base.resv,
> -				      NULL));
> -	resv = bp.resv;
> -	memset(&bp, 0, sizeof(bp));
> -	bp.size = amdgpu_vm_pt_size(adev, level);
> -	bp.domain = AMDGPU_GEM_DOMAIN_GTT;
> -	bp.flags = AMDGPU_GEM_CREATE_CPU_GTT_USWC;
> -	bp.type = ttm_bo_type_kernel;
> -	bp.resv = bo->tbo.base.resv;
> -	bp.bo_ptr_size = sizeof(struct amdgpu_bo);
> -	bp.xcp_id_plus1 = xcp_id + 1;
> -
> -	r = amdgpu_bo_create(adev, &bp, &(*vmbo)->shadow);
> -
> -	if (!resv)
> -		dma_resv_unlock(bo->tbo.base.resv);
> -
> -	if (r) {
> -		amdgpu_bo_unref(&bo);
> -		return r;
> -	}
> -
> -	amdgpu_bo_add_to_shadow_list(*vmbo);
> -
> -	return 0;
> +	return amdgpu_bo_create_vm(adev, &bp, vmbo);
>  }
>  
>  /**
> @@ -569,7 +523,6 @@ static int amdgpu_vm_pt_alloc(struct amdgpu_device *adev,
>  	return 0;
>  
>  error_free_pt:
> -	amdgpu_bo_unref(&pt->shadow);
>  	amdgpu_bo_unref(&pt_bo);
>  	return r;
>  }
> @@ -581,17 +534,10 @@ static int amdgpu_vm_pt_alloc(struct amdgpu_device *adev,
>   */
>  static void amdgpu_vm_pt_free(struct amdgpu_vm_bo_base *entry)
>  {
> -	struct amdgpu_bo *shadow;
> -
>  	if (!entry->bo)
>  		return;
>  
>  	entry->bo->vm_bo = NULL;
> -	shadow = amdgpu_bo_shadowed(entry->bo);
> -	if (shadow) {
> -		ttm_bo_set_bulk_move(&shadow->tbo, NULL);
> -		amdgpu_bo_unref(&shadow);
> -	}
>  	ttm_bo_set_bulk_move(&entry->bo->tbo, NULL);
>  
>  	spin_lock(&entry->vm->status_lock);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
> index 9b748d7058b5..390432a22ddd 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
> @@ -35,16 +35,7 @@
>   */
>  static int amdgpu_vm_sdma_map_table(struct amdgpu_bo_vm *table)
>  {
> -	int r;
> -
> -	r = amdgpu_ttm_alloc_gart(&table->bo.tbo);
> -	if (r)
> -		return r;
> -
> -	if (table->shadow)
> -		r = amdgpu_ttm_alloc_gart(&table->shadow->tbo);
> -
> -	return r;
> +	return amdgpu_ttm_alloc_gart(&table->bo.tbo);
>  }
>  
>  /* Allocate a new job for @count PTE updates */
> @@ -273,17 +264,13 @@ static int amdgpu_vm_sdma_update(struct amdgpu_vm_update_params *p,
>  
>  		if (!p->pages_addr) {
>  			/* set page commands needed */
> -			if (vmbo->shadow)
> -				amdgpu_vm_sdma_set_ptes(p, vmbo->shadow, pe, addr,
> -							count, incr, flags);
>  			amdgpu_vm_sdma_set_ptes(p, bo, pe, addr, count,
>  						incr, flags);
>  			return 0;
>  		}
>  
>  		/* copy commands needed */
> -		ndw -= p->adev->vm_manager.vm_pte_funcs->copy_pte_num_dw *
> -			(vmbo->shadow ? 2 : 1);
> +		ndw -= p->adev->vm_manager.vm_pte_funcs->copy_pte_num_dw;
>  
>  		/* for padding */
>  		ndw -= 7;
> @@ -298,8 +285,6 @@ static int amdgpu_vm_sdma_update(struct amdgpu_vm_update_params *p,
>  			pte[i] |= flags;
>  		}
>  
> -		if (vmbo->shadow)
> -			amdgpu_vm_sdma_copy_ptes(p, vmbo->shadow, pe, nptes);
>  		amdgpu_vm_sdma_copy_ptes(p, bo, pe, nptes);
>  
>  		pe += nptes * 8;


More information about the amd-gfx mailing list