[PATCH 5/5] drm/amdgpu: fix shadow BO restoring

Christian König ckoenig.leichtzumerken at gmail.com
Mon Sep 17 11:42:38 UTC 2018


Am 17.09.2018 um 11:10 schrieb Huang Rui:
> On Fri, Sep 14, 2018 at 03:42:57PM +0200, Christian König wrote:
>> Don't grab the reservation lock any more and simplify the handling quite
>> a bit.
>>
>> Signed-off-by: Christian König <christian.koenig at amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 109 ++++++++---------------------
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_object.c |  46 ++++--------
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_object.h |   8 +--
>>   3 files changed, 43 insertions(+), 120 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 899342c6dfad..1cbc372964f8 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -2954,54 +2954,6 @@ static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
>>   	return 0;
>>   }
>>   
>> -/**
>> - * amdgpu_device_recover_vram_from_shadow - restore shadowed VRAM buffers
>> - *
>> - * @adev: amdgpu_device pointer
>> - * @ring: amdgpu_ring for the engine handling the buffer operations
>> - * @bo: amdgpu_bo buffer whose shadow is being restored
>> - * @fence: dma_fence associated with the operation
>> - *
>> - * Restores the VRAM buffer contents from the shadow in GTT.  Used to
>> - * restore things like GPUVM page tables after a GPU reset where
>> - * the contents of VRAM might be lost.
>> - * Returns 0 on success, negative error code on failure.
>> - */
>> -static int amdgpu_device_recover_vram_from_shadow(struct amdgpu_device *adev,
>> -						  struct amdgpu_ring *ring,
>> -						  struct amdgpu_bo *bo,
>> -						  struct dma_fence **fence)
>> -{
>> -	uint32_t domain;
>> -	int r;
>> -
>> -	if (!bo->shadow)
>> -		return 0;
>> -
>> -	r = amdgpu_bo_reserve(bo, true);
>> -	if (r)
>> -		return r;
>> -	domain = amdgpu_mem_type_to_domain(bo->tbo.mem.mem_type);
>> -	/* if bo has been evicted, then no need to recover */
>> -	if (domain == AMDGPU_GEM_DOMAIN_VRAM) {
>> -		r = amdgpu_bo_validate(bo->shadow);
>> -		if (r) {
>> -			DRM_ERROR("bo validate failed!\n");
>> -			goto err;
>> -		}
>> -
>> -		r = amdgpu_bo_restore_from_shadow(adev, ring, bo,
>> -						 NULL, fence, true);
>> -		if (r) {
>> -			DRM_ERROR("recover page table failed!\n");
>> -			goto err;
>> -		}
>> -	}
>> -err:
>> -	amdgpu_bo_unreserve(bo);
>> -	return r;
>> -}
>> -
>>   /**
>>    * amdgpu_device_recover_vram - Recover some VRAM contents
>>    *
>> @@ -3010,16 +2962,15 @@ static int amdgpu_device_recover_vram_from_shadow(struct amdgpu_device *adev,
>>    * Restores the contents of VRAM buffers from the shadows in GTT.  Used to
>>    * restore things like GPUVM page tables after a GPU reset where
>>    * the contents of VRAM might be lost.
>> - * Returns 0 on success, 1 on failure.
>> + *
>> + * Returns:
>> + * 0 on success, negative error code on failure.
>>    */
>>   static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
>>   {
>> -	struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;
>> -	struct amdgpu_bo *bo, *tmp;
>>   	struct dma_fence *fence = NULL, *next = NULL;
>> -	long r = 1;
>> -	int i = 0;
>> -	long tmo;
>> +	struct amdgpu_bo *shadow;
>> +	long r = 1, tmo;
>>   
>>   	if (amdgpu_sriov_runtime(adev))
>>   		tmo = msecs_to_jiffies(8000);
>> @@ -3028,44 +2979,40 @@ static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
>>   
>>   	DRM_INFO("recover vram bo from shadow start\n");
>>   	mutex_lock(&adev->shadow_list_lock);
>> -	list_for_each_entry_safe(bo, tmp, &adev->shadow_list, shadow_list) {
>> -		next = NULL;
>> -		amdgpu_device_recover_vram_from_shadow(adev, ring, bo, &next);
>> +	list_for_each_entry(shadow, &adev->shadow_list, shadow_list) {
>> +
>> +		/* No need to recover an evicted BO */
>> +		if (shadow->tbo.mem.mem_type != TTM_PL_TT ||
>> +		    shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM)
>> +			continue;
>> +
>> +		r = amdgpu_bo_restore_shadow(shadow, &next);
>> +		if (r)
>> +			break;
>> +
>>   		if (fence) {
>>   			r = dma_fence_wait_timeout(fence, false, tmo);
>> -			if (r == 0)
>> -				pr_err("wait fence %p[%d] timeout\n", fence, i);
>> -			else if (r < 0)
>> -				pr_err("wait fence %p[%d] interrupted\n", fence, i);
>> -			if (r < 1) {
>> -				dma_fence_put(fence);
>> -				fence = next;
>> +			dma_fence_put(fence);
>> +			fence = next;
>> +			if (r <= 0)
>>   				break;
>> -			}
>> -			i++;
>> +		} else {
>> +			fence = next;
>>   		}
>> -
>> -		dma_fence_put(fence);
>> -		fence = next;
>>   	}
>>   	mutex_unlock(&adev->shadow_list_lock);
>>   
>> -	if (fence) {
>> -		r = dma_fence_wait_timeout(fence, false, tmo);
>> -		if (r == 0)
>> -			pr_err("wait fence %p[%d] timeout\n", fence, i);
>> -		else if (r < 0)
>> -			pr_err("wait fence %p[%d] interrupted\n", fence, i);
>> -
>> -	}
>> +	if (fence)
>> +		tmo = dma_fence_wait_timeout(fence, false, tmo);
>>   	dma_fence_put(fence);
>>   
>> -	if (r > 0)
>> -		DRM_INFO("recover vram bo from shadow done\n");
>> -	else
>> +	if (r <= 0 || tmo <= 0) {
>>   		DRM_ERROR("recover vram bo from shadow failed\n");
>> +		return -EIO;
>> +	}
>>   
>> -	return (r > 0) ? 0 : 1;
>> +	DRM_INFO("recover vram bo from shadow done\n");
>> +	return 0;
>>   }
>>   
>>   /**
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>> index 650c45c896f0..ca8a0b7a5ac3 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>> @@ -547,7 +547,7 @@ static int amdgpu_bo_create_shadow(struct amdgpu_device *adev,
>>   	if (!r) {
>>   		bo->shadow->parent = amdgpu_bo_ref(bo);
>>   		mutex_lock(&adev->shadow_list_lock);
>> -		list_add_tail(&bo->shadow_list, &adev->shadow_list);
>> +		list_add_tail(&bo->shadow->shadow_list, &adev->shadow_list);
>>   		mutex_unlock(&adev->shadow_list_lock);
>>   	}
>>   
>> @@ -679,13 +679,10 @@ int amdgpu_bo_validate(struct amdgpu_bo *bo)
>>   }
>>   
>>   /**
>> - * amdgpu_bo_restore_from_shadow - restore an &amdgpu_bo buffer object
>> - * @adev: amdgpu device object
>> - * @ring: amdgpu_ring for the engine handling the buffer operations
>> - * @bo: &amdgpu_bo buffer to be restored
>> - * @resv: reservation object with embedded fence
>> + * amdgpu_bo_restore_shadow - restore an &amdgpu_bo shadow
>> + *
>> + * @shadow: &amdgpu_bo shadow to be restored
>>    * @fence: dma_fence associated with the operation
>> - * @direct: whether to submit the job directly
>>    *
>>    * Copies a buffer object's shadow content back to the object.
>>    * This is used for recovering a buffer from its shadow in case of a gpu
>> @@ -694,36 +691,19 @@ int amdgpu_bo_validate(struct amdgpu_bo *bo)
>>    * Returns:
>>    * 0 for success or a negative error code on failure.
>>    */
>> -int amdgpu_bo_restore_from_shadow(struct amdgpu_device *adev,
>> -				  struct amdgpu_ring *ring,
>> -				  struct amdgpu_bo *bo,
>> -				  struct reservation_object *resv,
>> -				  struct dma_fence **fence,
>> -				  bool direct)
>> +int amdgpu_bo_restore_shadow(struct amdgpu_bo *shadow, struct dma_fence **fence)
>>   
>>   {
>> -	struct amdgpu_bo *shadow = bo->shadow;
>> -	uint64_t bo_addr, shadow_addr;
>> -	int r;
>> -
>> -	if (!shadow)
>> -		return -EINVAL;
>> -
>> -	bo_addr = amdgpu_bo_gpu_offset(bo);
>> -	shadow_addr = amdgpu_bo_gpu_offset(bo->shadow);
>> -
>> -	r = reservation_object_reserve_shared(bo->tbo.resv);
>> -	if (r)
>> -		goto err;
>> +	struct amdgpu_device *adev = amdgpu_ttm_adev(shadow->tbo.bdev);
>> +	struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;
>> +	uint64_t shadow_addr, parent_addr;
>>   
> May I know why won't need the reservation_object_reserve_shared() here? Is
> it because we only copy buffer from shadow parent bo instead of orignal bo?

The scheduler and delayed free thread are disabled and we wait for the 
copy to finish before allowing any eviction to proceed.

Adding the BO as shared fence to the BO could actually be harmful 
because TTM might already want to destroy it.

Regards,
Christian.

>
> Thanks,
> Ray
>
>> -	r = amdgpu_copy_buffer(ring, shadow_addr, bo_addr,
>> -			       amdgpu_bo_size(bo), resv, fence,
>> -			       direct, false);
>> -	if (!r)
>> -		amdgpu_bo_fence(bo, *fence, true);
>> +	shadow_addr = amdgpu_bo_gpu_offset(shadow);
>> +	parent_addr = amdgpu_bo_gpu_offset(shadow->parent);
>>   
>> -err:
>> -	return r;
>> +	return amdgpu_copy_buffer(ring, shadow_addr, parent_addr,
>> +				  amdgpu_bo_size(shadow), NULL, fence,
>> +				  true, false);
>>   }
>>   
>>   /**
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>> index 64337ff2ad63..7d3312d0da11 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>> @@ -273,12 +273,8 @@ int amdgpu_bo_backup_to_shadow(struct amdgpu_device *adev,
>>   			       struct reservation_object *resv,
>>   			       struct dma_fence **fence, bool direct);
>>   int amdgpu_bo_validate(struct amdgpu_bo *bo);
>> -int amdgpu_bo_restore_from_shadow(struct amdgpu_device *adev,
>> -				  struct amdgpu_ring *ring,
>> -				  struct amdgpu_bo *bo,
>> -				  struct reservation_object *resv,
>> -				  struct dma_fence **fence,
>> -				  bool direct);
>> +int amdgpu_bo_restore_shadow(struct amdgpu_bo *shadow,
>> +			     struct dma_fence **fence);
>>   uint32_t amdgpu_bo_get_preferred_pin_domain(struct amdgpu_device *adev,
>>   					    uint32_t domain);
>>   
>> -- 
>> 2.14.1
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx at lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx



More information about the amd-gfx mailing list