[PATCH 5/5] drm/amdgpu: fix shadow BO restoring
Zhang, Jerry(Junwei)
Jerry.Zhang at amd.com
Tue Sep 18 06:15:37 UTC 2018
On 09/14/2018 07:54 PM, Christian König wrote:
> Am 13.09.2018 um 11:29 schrieb Zhang, Jerry(Junwei):
>> On 09/11/2018 05:56 PM, Christian König wrote:
>>> Don't grab the reservation lock any more and simplify the handling
>>> quite
>>> a bit.
>>>
>>> Signed-off-by: Christian König <christian.koenig at amd.com>
>>> ---
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 109
>>> ++++++++---------------------
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 46 ++++--------
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_object.h | 8 +--
>>> 3 files changed, 43 insertions(+), 120 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index 5eba66ecf668..20bb702f5c7f 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -2940,54 +2940,6 @@ static int
>>> amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
>>> return 0;
>>> }
>>> -/**
>>> - * amdgpu_device_recover_vram_from_shadow - restore shadowed VRAM
>>> buffers
>>> - *
>>> - * @adev: amdgpu_device pointer
>>> - * @ring: amdgpu_ring for the engine handling the buffer operations
>>> - * @bo: amdgpu_bo buffer whose shadow is being restored
>>> - * @fence: dma_fence associated with the operation
>>> - *
>>> - * Restores the VRAM buffer contents from the shadow in GTT. Used to
>>> - * restore things like GPUVM page tables after a GPU reset where
>>> - * the contents of VRAM might be lost.
>>> - * Returns 0 on success, negative error code on failure.
>>> - */
>>> -static int amdgpu_device_recover_vram_from_shadow(struct
>>> amdgpu_device *adev,
>>> - struct amdgpu_ring *ring,
>>> - struct amdgpu_bo *bo,
>>> - struct dma_fence **fence)
>>> -{
>>> - uint32_t domain;
>>> - int r;
>>> -
>>> - if (!bo->shadow)
>>> - return 0;
>>> -
>>> - r = amdgpu_bo_reserve(bo, true);
>>> - if (r)
>>> - return r;
>>> - domain = amdgpu_mem_type_to_domain(bo->tbo.mem.mem_type);
>>> - /* if bo has been evicted, then no need to recover */
>>> - if (domain == AMDGPU_GEM_DOMAIN_VRAM) {
>>> - r = amdgpu_bo_validate(bo->shadow);
>>> - if (r) {
>>> - DRM_ERROR("bo validate failed!\n");
>>> - goto err;
>>> - }
>>> -
>>> - r = amdgpu_bo_restore_from_shadow(adev, ring, bo,
>>> - NULL, fence, true);
>>> - if (r) {
>>> - DRM_ERROR("recover page table failed!\n");
>>> - goto err;
>>> - }
>>> - }
>>> -err:
>>> - amdgpu_bo_unreserve(bo);
>>> - return r;
>>> -}
>>> -
>>> /**
>>> * amdgpu_device_recover_vram - Recover some VRAM contents
>>> *
>>> @@ -2996,16 +2948,15 @@ static int
>>> amdgpu_device_recover_vram_from_shadow(struct amdgpu_device *adev,
>>> * Restores the contents of VRAM buffers from the shadows in GTT.
>>> Used to
>>> * restore things like GPUVM page tables after a GPU reset where
>>> * the contents of VRAM might be lost.
>>> - * Returns 0 on success, 1 on failure.
>>> + *
>>> + * Returns:
>>> + * 0 on success, negative error code on failure.
>>> */
>>> static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
>>> {
>>> - struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;
>>> - struct amdgpu_bo *bo, *tmp;
>>> struct dma_fence *fence = NULL, *next = NULL;
>>> - long r = 1;
>>> - int i = 0;
>>> - long tmo;
>>> + struct amdgpu_bo *shadow;
>>> + long r = 1, tmo;
>>> if (amdgpu_sriov_runtime(adev))
>>> tmo = msecs_to_jiffies(8000);
>>> @@ -3014,44 +2965,40 @@ static int amdgpu_device_recover_vram(struct
>>> amdgpu_device *adev)
>>> DRM_INFO("recover vram bo from shadow start\n");
>>> mutex_lock(&adev->shadow_list_lock);
>>> - list_for_each_entry_safe(bo, tmp, &adev->shadow_list,
>>> shadow_list) {
>>> - next = NULL;
>>> - amdgpu_device_recover_vram_from_shadow(adev, ring, bo, &next);
>>> + list_for_each_entry(shadow, &adev->shadow_list, shadow_list) {
>>> +
>>> + /* No need to recover an evicted BO */
>>> + if (shadow->tbo.mem.mem_type != TTM_PL_TT ||
>>> + shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM)
>> is there a chance that the shadow bo is evicted to another domain?
>> like SYSTEM?
>
> Yes, that's why I test "!= TTM_PL_TT" here.
>
> What can happen is that either the shadow or the page table or page
> directory is evicted.
>
> But in this case we don't need to restore anything because of patch #1
> in this series.
Thanks, then it's
Acked-by: Junwei Zhang <Jerry.Zhang at amd.com>
Regards,
Jerry
>
> Regards,
> Christian.
>
>>
>> Regards,
>> Jerry
>>> + continue;
>>> +
>>> + r = amdgpu_bo_restore_shadow(shadow, &next);
>>> + if (r)
>>> + break;
>>> +
>>> if (fence) {
>>> r = dma_fence_wait_timeout(fence, false, tmo);
>>> - if (r == 0)
>>> - pr_err("wait fence %p[%d] timeout\n", fence, i);
>>> - else if (r < 0)
>>> - pr_err("wait fence %p[%d] interrupted\n", fence, i);
>>> - if (r < 1) {
>>> - dma_fence_put(fence);
>>> - fence = next;
>>> + dma_fence_put(fence);
>>> + fence = next;
>>> + if (r <= 0)
>>> break;
>>> - }
>>> - i++;
>>> + } else {
>>> + fence = next;
>>> }
>>> -
>>> - dma_fence_put(fence);
>>> - fence = next;
>>> }
>>> mutex_unlock(&adev->shadow_list_lock);
>>> - if (fence) {
>>> - r = dma_fence_wait_timeout(fence, false, tmo);
>>> - if (r == 0)
>>> - pr_err("wait fence %p[%d] timeout\n", fence, i);
>>> - else if (r < 0)
>>> - pr_err("wait fence %p[%d] interrupted\n", fence, i);
>>> -
>>> - }
>>> + if (fence)
>>> + tmo = dma_fence_wait_timeout(fence, false, tmo);
>>> dma_fence_put(fence);
>>> - if (r > 0)
>>> - DRM_INFO("recover vram bo from shadow done\n");
>>> - else
>>> + if (r <= 0 || tmo <= 0) {
>>> DRM_ERROR("recover vram bo from shadow failed\n");
>>> + return -EIO;
>>> + }
>>> - return (r > 0) ? 0 : 1;
>>> + DRM_INFO("recover vram bo from shadow done\n");
>>> + return 0;
>>> }
>>> /**
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>> index 3a6f92de5504..5b6d5fcc2151 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>> @@ -537,7 +537,7 @@ static int amdgpu_bo_create_shadow(struct
>>> amdgpu_device *adev,
>>> if (!r) {
>>> bo->shadow->parent = amdgpu_bo_ref(bo);
>>> mutex_lock(&adev->shadow_list_lock);
>>> - list_add_tail(&bo->shadow_list, &adev->shadow_list);
>>> + list_add_tail(&bo->shadow->shadow_list, &adev->shadow_list);
>>> mutex_unlock(&adev->shadow_list_lock);
>>> }
>>> @@ -669,13 +669,10 @@ int amdgpu_bo_validate(struct amdgpu_bo *bo)
>>> }
>>> /**
>>> - * amdgpu_bo_restore_from_shadow - restore an &amdgpu_bo buffer object
>>> - * @adev: amdgpu device object
>>> - * @ring: amdgpu_ring for the engine handling the buffer operations
>>> - * @bo: &amdgpu_bo buffer to be restored
>>> - * @resv: reservation object with embedded fence
>>> + * amdgpu_bo_restore_shadow - restore an &amdgpu_bo shadow
>>> + *
>>> + * @shadow: &amdgpu_bo shadow to be restored
>>> * @fence: dma_fence associated with the operation
>>> - * @direct: whether to submit the job directly
>>> *
>>> * Copies a buffer object's shadow content back to the object.
>>> * This is used for recovering a buffer from its shadow in case of
>>> a gpu
>>> @@ -684,36 +681,19 @@ int amdgpu_bo_validate(struct amdgpu_bo *bo)
>>> * Returns:
>>> * 0 for success or a negative error code on failure.
>>> */
>>> -int amdgpu_bo_restore_from_shadow(struct amdgpu_device *adev,
>>> - struct amdgpu_ring *ring,
>>> - struct amdgpu_bo *bo,
>>> - struct reservation_object *resv,
>>> - struct dma_fence **fence,
>>> - bool direct)
>>> +int amdgpu_bo_restore_shadow(struct amdgpu_bo *shadow, struct
>>> dma_fence **fence)
>>> {
>>> - struct amdgpu_bo *shadow = bo->shadow;
>>> - uint64_t bo_addr, shadow_addr;
>>> - int r;
>>> -
>>> - if (!shadow)
>>> - return -EINVAL;
>>> -
>>> - bo_addr = amdgpu_bo_gpu_offset(bo);
>>> - shadow_addr = amdgpu_bo_gpu_offset(bo->shadow);
>>> -
>>> - r = reservation_object_reserve_shared(bo->tbo.resv);
>>> - if (r)
>>> - goto err;
>>> + struct amdgpu_device *adev = amdgpu_ttm_adev(shadow->tbo.bdev);
>>> + struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;
>>> + uint64_t shadow_addr, parent_addr;
>>> - r = amdgpu_copy_buffer(ring, shadow_addr, bo_addr,
>>> - amdgpu_bo_size(bo), resv, fence,
>>> - direct, false);
>>> - if (!r)
>>> - amdgpu_bo_fence(bo, *fence, true);
>>> + shadow_addr = amdgpu_bo_gpu_offset(shadow);
>>> + parent_addr = amdgpu_bo_gpu_offset(shadow->parent);
>>> -err:
>>> - return r;
>>> + return amdgpu_copy_buffer(ring, shadow_addr, parent_addr,
>>> + amdgpu_bo_size(shadow), NULL, fence,
>>> + true, false);
>>> }
>>> /**
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>>> index 907fdf46d895..363db417fb2e 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>>> @@ -273,12 +273,8 @@ int amdgpu_bo_backup_to_shadow(struct
>>> amdgpu_device *adev,
>>> struct reservation_object *resv,
>>> struct dma_fence **fence, bool direct);
>>> int amdgpu_bo_validate(struct amdgpu_bo *bo);
>>> -int amdgpu_bo_restore_from_shadow(struct amdgpu_device *adev,
>>> - struct amdgpu_ring *ring,
>>> - struct amdgpu_bo *bo,
>>> - struct reservation_object *resv,
>>> - struct dma_fence **fence,
>>> - bool direct);
>>> +int amdgpu_bo_restore_shadow(struct amdgpu_bo *shadow,
>>> + struct dma_fence **fence);
>>> uint32_t amdgpu_bo_get_preferred_pin_domain(struct amdgpu_device
>>> *adev,
>>> uint32_t domain);
>>
>
More information about the amd-gfx
mailing list