[PATCH 5/5] drm/amdgpu: fix shadow BO restoring

Zhang, Jerry(Junwei) Jerry.Zhang at amd.com
Tue Sep 18 06:15:37 UTC 2018


On 09/14/2018 07:54 PM, Christian König wrote:
> Am 13.09.2018 um 11:29 schrieb Zhang, Jerry(Junwei):
>> On 09/11/2018 05:56 PM, Christian König wrote:
>>> Don't grab the reservation lock any more and simplify the handling 
>>> quite
>>> a bit.
>>>
>>> Signed-off-by: Christian König <christian.koenig at amd.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 109 
>>> ++++++++---------------------
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_object.c |  46 ++++--------
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_object.h |   8 +--
>>>   3 files changed, 43 insertions(+), 120 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index 5eba66ecf668..20bb702f5c7f 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -2940,54 +2940,6 @@ static int 
>>> amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
>>>       return 0;
>>>   }
>>>   -/**
>>> - * amdgpu_device_recover_vram_from_shadow - restore shadowed VRAM 
>>> buffers
>>> - *
>>> - * @adev: amdgpu_device pointer
>>> - * @ring: amdgpu_ring for the engine handling the buffer operations
>>> - * @bo: amdgpu_bo buffer whose shadow is being restored
>>> - * @fence: dma_fence associated with the operation
>>> - *
>>> - * Restores the VRAM buffer contents from the shadow in GTT. Used to
>>> - * restore things like GPUVM page tables after a GPU reset where
>>> - * the contents of VRAM might be lost.
>>> - * Returns 0 on success, negative error code on failure.
>>> - */
>>> -static int amdgpu_device_recover_vram_from_shadow(struct 
>>> amdgpu_device *adev,
>>> -                          struct amdgpu_ring *ring,
>>> -                          struct amdgpu_bo *bo,
>>> -                          struct dma_fence **fence)
>>> -{
>>> -    uint32_t domain;
>>> -    int r;
>>> -
>>> -    if (!bo->shadow)
>>> -        return 0;
>>> -
>>> -    r = amdgpu_bo_reserve(bo, true);
>>> -    if (r)
>>> -        return r;
>>> -    domain = amdgpu_mem_type_to_domain(bo->tbo.mem.mem_type);
>>> -    /* if bo has been evicted, then no need to recover */
>>> -    if (domain == AMDGPU_GEM_DOMAIN_VRAM) {
>>> -        r = amdgpu_bo_validate(bo->shadow);
>>> -        if (r) {
>>> -            DRM_ERROR("bo validate failed!\n");
>>> -            goto err;
>>> -        }
>>> -
>>> -        r = amdgpu_bo_restore_from_shadow(adev, ring, bo,
>>> -                         NULL, fence, true);
>>> -        if (r) {
>>> -            DRM_ERROR("recover page table failed!\n");
>>> -            goto err;
>>> -        }
>>> -    }
>>> -err:
>>> -    amdgpu_bo_unreserve(bo);
>>> -    return r;
>>> -}
>>> -
>>>   /**
>>>    * amdgpu_device_recover_vram - Recover some VRAM contents
>>>    *
>>> @@ -2996,16 +2948,15 @@ static int 
>>> amdgpu_device_recover_vram_from_shadow(struct amdgpu_device *adev,
>>>    * Restores the contents of VRAM buffers from the shadows in GTT.  
>>> Used to
>>>    * restore things like GPUVM page tables after a GPU reset where
>>>    * the contents of VRAM might be lost.
>>> - * Returns 0 on success, 1 on failure.
>>> + *
>>> + * Returns:
>>> + * 0 on success, negative error code on failure.
>>>    */
>>>   static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
>>>   {
>>> -    struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;
>>> -    struct amdgpu_bo *bo, *tmp;
>>>       struct dma_fence *fence = NULL, *next = NULL;
>>> -    long r = 1;
>>> -    int i = 0;
>>> -    long tmo;
>>> +    struct amdgpu_bo *shadow;
>>> +    long r = 1, tmo;
>>>         if (amdgpu_sriov_runtime(adev))
>>>           tmo = msecs_to_jiffies(8000);
>>> @@ -3014,44 +2965,40 @@ static int amdgpu_device_recover_vram(struct 
>>> amdgpu_device *adev)
>>>         DRM_INFO("recover vram bo from shadow start\n");
>>>       mutex_lock(&adev->shadow_list_lock);
>>> -    list_for_each_entry_safe(bo, tmp, &adev->shadow_list, 
>>> shadow_list) {
>>> -        next = NULL;
>>> -        amdgpu_device_recover_vram_from_shadow(adev, ring, bo, &next);
>>> +    list_for_each_entry(shadow, &adev->shadow_list, shadow_list) {
>>> +
>>> +        /* No need to recover an evicted BO */
>>> +        if (shadow->tbo.mem.mem_type != TTM_PL_TT ||
>>> +            shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM)
>> is there a change that shadow bo evicted to other domain?
>> like SYSTEM?
>
> Yes, that's why I test "!= TTM_PL_TT" here.
>
> What can happen is that either the shadow or the page table or page 
> directory is evicted.
>
> But in this case we don't need to restore anything because of patch #1 
> in this series.

Thanks, then it's
Acked-by: Junwei Zhang <Jerry.Zhang at amd.com>

Regards,
Jerry

>
> Regards,
> Christian.
>
>>
>> Regards,
>> Jerry
>>> +            continue;
>>> +
>>> +        r = amdgpu_bo_restore_shadow(shadow, &next);
>>> +        if (r)
>>> +            break;
>>> +
>>>           if (fence) {
>>>               r = dma_fence_wait_timeout(fence, false, tmo);
>>> -            if (r == 0)
>>> -                pr_err("wait fence %p[%d] timeout\n", fence, i);
>>> -            else if (r < 0)
>>> -                pr_err("wait fence %p[%d] interrupted\n", fence, i);
>>> -            if (r < 1) {
>>> -                dma_fence_put(fence);
>>> -                fence = next;
>>> +            dma_fence_put(fence);
>>> +            fence = next;
>>> +            if (r <= 0)
>>>                   break;
>>> -            }
>>> -            i++;
>>> +        } else {
>>> +            fence = next;
>>>           }
>>> -
>>> -        dma_fence_put(fence);
>>> -        fence = next;
>>>       }
>>>       mutex_unlock(&adev->shadow_list_lock);
>>>   -    if (fence) {
>>> -        r = dma_fence_wait_timeout(fence, false, tmo);
>>> -        if (r == 0)
>>> -            pr_err("wait fence %p[%d] timeout\n", fence, i);
>>> -        else if (r < 0)
>>> -            pr_err("wait fence %p[%d] interrupted\n", fence, i);
>>> -
>>> -    }
>>> +    if (fence)
>>> +        tmo = dma_fence_wait_timeout(fence, false, tmo);
>>>       dma_fence_put(fence);
>>>   -    if (r > 0)
>>> -        DRM_INFO("recover vram bo from shadow done\n");
>>> -    else
>>> +    if (r <= 0 || tmo <= 0) {
>>>           DRM_ERROR("recover vram bo from shadow failed\n");
>>> +        return -EIO;
>>> +    }
>>>   -    return (r > 0) ? 0 : 1;
>>> +    DRM_INFO("recover vram bo from shadow done\n");
>>> +    return 0;
>>>   }
>>>     /**
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>> index 3a6f92de5504..5b6d5fcc2151 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>> @@ -537,7 +537,7 @@ static int amdgpu_bo_create_shadow(struct 
>>> amdgpu_device *adev,
>>>       if (!r) {
>>>           bo->shadow->parent = amdgpu_bo_ref(bo);
>>>           mutex_lock(&adev->shadow_list_lock);
>>> -        list_add_tail(&bo->shadow_list, &adev->shadow_list);
>>> +        list_add_tail(&bo->shadow->shadow_list, &adev->shadow_list);
>>>           mutex_unlock(&adev->shadow_list_lock);
>>>       }
>>>   @@ -669,13 +669,10 @@ int amdgpu_bo_validate(struct amdgpu_bo *bo)
>>>   }
>>>     /**
>>> - * amdgpu_bo_restore_from_shadow - restore an &amdgpu_bo buffer object
>>> - * @adev: amdgpu device object
>>> - * @ring: amdgpu_ring for the engine handling the buffer operations
>>> - * @bo: &amdgpu_bo buffer to be restored
>>> - * @resv: reservation object with embedded fence
>>> + * amdgpu_bo_restore_shadow - restore an &amdgpu_bo shadow
>>> + *
>>> + * @shadow: &amdgpu_bo shadow to be restored
>>>    * @fence: dma_fence associated with the operation
>>> - * @direct: whether to submit the job directly
>>>    *
>>>    * Copies a buffer object's shadow content back to the object.
>>>    * This is used for recovering a buffer from its shadow in case of 
>>> a gpu
>>> @@ -684,36 +681,19 @@ int amdgpu_bo_validate(struct amdgpu_bo *bo)
>>>    * Returns:
>>>    * 0 for success or a negative error code on failure.
>>>    */
>>> -int amdgpu_bo_restore_from_shadow(struct amdgpu_device *adev,
>>> -                  struct amdgpu_ring *ring,
>>> -                  struct amdgpu_bo *bo,
>>> -                  struct reservation_object *resv,
>>> -                  struct dma_fence **fence,
>>> -                  bool direct)
>>> +int amdgpu_bo_restore_shadow(struct amdgpu_bo *shadow, struct 
>>> dma_fence **fence)
>>>     {
>>> -    struct amdgpu_bo *shadow = bo->shadow;
>>> -    uint64_t bo_addr, shadow_addr;
>>> -    int r;
>>> -
>>> -    if (!shadow)
>>> -        return -EINVAL;
>>> -
>>> -    bo_addr = amdgpu_bo_gpu_offset(bo);
>>> -    shadow_addr = amdgpu_bo_gpu_offset(bo->shadow);
>>> -
>>> -    r = reservation_object_reserve_shared(bo->tbo.resv);
>>> -    if (r)
>>> -        goto err;
>>> +    struct amdgpu_device *adev = amdgpu_ttm_adev(shadow->tbo.bdev);
>>> +    struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;
>>> +    uint64_t shadow_addr, parent_addr;
>>>   -    r = amdgpu_copy_buffer(ring, shadow_addr, bo_addr,
>>> -                   amdgpu_bo_size(bo), resv, fence,
>>> -                   direct, false);
>>> -    if (!r)
>>> -        amdgpu_bo_fence(bo, *fence, true);
>>> +    shadow_addr = amdgpu_bo_gpu_offset(shadow);
>>> +    parent_addr = amdgpu_bo_gpu_offset(shadow->parent);
>>>   -err:
>>> -    return r;
>>> +    return amdgpu_copy_buffer(ring, shadow_addr, parent_addr,
>>> +                  amdgpu_bo_size(shadow), NULL, fence,
>>> +                  true, false);
>>>   }
>>>     /**
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>>> index 907fdf46d895..363db417fb2e 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>>> @@ -273,12 +273,8 @@ int amdgpu_bo_backup_to_shadow(struct 
>>> amdgpu_device *adev,
>>>                      struct reservation_object *resv,
>>>                      struct dma_fence **fence, bool direct);
>>>   int amdgpu_bo_validate(struct amdgpu_bo *bo);
>>> -int amdgpu_bo_restore_from_shadow(struct amdgpu_device *adev,
>>> -                  struct amdgpu_ring *ring,
>>> -                  struct amdgpu_bo *bo,
>>> -                  struct reservation_object *resv,
>>> -                  struct dma_fence **fence,
>>> -                  bool direct);
>>> +int amdgpu_bo_restore_shadow(struct amdgpu_bo *shadow,
>>> +                 struct dma_fence **fence);
>>>   uint32_t amdgpu_bo_get_preferred_pin_domain(struct amdgpu_device 
>>> *adev,
>>>                           uint32_t domain);
>>
>



More information about the amd-gfx mailing list