[PATCH 4/4] drm/amdgpu: rework synchronization of VM updates v3

Thu Jan 23 14:35:07 UTC 2020

That is fixed by patch #2 in this series.

Patch #4 is then re-applying the faulty synchronization cleanup.

Regards,
Christian.

Am 23.01.20 um 15:25 schrieb Tom St Denis:
> On the tip of drm-next (as of this morning) I was still getting
>
> [  983.891264] [drm:amdgpu_gem_va_ioctl [amdgpu]] *ERROR* Couldn't 
> update BO_VA (-2)
>
> type errors.  So I'm looking for that to stop.  At least LLVM was 
> fixed so I can run a full run of piglit without gfx hangs.
>
> Tom
>
> On 2020-01-23 9:24 a.m., Christian König wrote:
>> Thanks, please give them a full round.
>>
>> Took me a week to figure out that we accidentally pass in the 
>> reservation object as NULL for cleared BOs.
>>
>> Thanks,
>> Christian.
>>
>> Am 23.01.20 um 15:22 schrieb Tom St Denis:
>>> Just applied these now, trying them out will report back in ~20 mins.
>>>
>>> On 2020-01-23 9:21 a.m., Christian König wrote:
>>>> If provided we only sync to the BOs reservation
>>>> object and no longer to the root PD.
>>>>
>>>> v2: update comment, cleanup amdgpu_bo_sync_wait_resv
>>>> v3: use correct reservation object while clearing
>>>>
>>>> Signed-off-by: Christian König <christian.koenig at amd.com>
>>>> ---
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_object.c  | 37 
>>>> ++++++++++++++++++++------
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_object.h  |  3 +++
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c    |  7 -----
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c      | 41 
>>>> +++++++++++++++++------------
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h      |  4 +--
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c  | 22 ++++------------
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c | 15 +++--------
>>>>   7 files changed, 67 insertions(+), 62 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>>> index 46c76e2e1281..c70bbdda078c 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>>> @@ -1403,30 +1403,51 @@ void amdgpu_bo_fence(struct amdgpu_bo *bo, 
>>>> struct dma_fence *fence,
>>>>   }
>>>>     /**
>>>> - * amdgpu_sync_wait_resv - Wait for BO reservation fences
>>>> + * amdgpu_bo_sync_wait_resv - Wait for BO reservation fences
>>>>    *
>>>> - * @bo: buffer object
>>>> + * @adev: amdgpu device pointer
>>>> + * @resv: reservation object to sync to
>>>> + * @sync_mode: synchronization mode
>>>>    * @owner: fence owner
>>>>    * @intr: Whether the wait is interruptible
>>>>    *
>>>> + * Extract the fences from the reservation object and waits for 
>>>> them to finish.
>>>> + *
>>>>    * Returns:
>>>>    * 0 on success, errno otherwise.
>>>>    */
>>>> -int amdgpu_bo_sync_wait(struct amdgpu_bo *bo, void *owner, bool intr)
>>>> +int amdgpu_bo_sync_wait_resv(struct amdgpu_device *adev, struct 
>>>> dma_resv *resv,
>>>> +                 enum amdgpu_sync_mode sync_mode, void *owner,
>>>> +                 bool intr)
>>>>   {
>>>> -    struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
>>>>       struct amdgpu_sync sync;
>>>>       int r;
>>>>         amdgpu_sync_create(&sync);
>>>> -    amdgpu_sync_resv(adev, &sync, bo->tbo.base.resv,
>>>> -             AMDGPU_SYNC_NE_OWNER, owner);
>>>> -    r = amdgpu_sync_wait(&sync, intr);
>>>> +    amdgpu_sync_resv(adev, &sync, resv, sync_mode, owner);
>>>> +    r = amdgpu_sync_wait(&sync, true);
>>>>       amdgpu_sync_free(&sync);
>>>> -
>>>>       return r;
>>>>   }
>>>>   +/**
>>>> + * amdgpu_bo_sync_wait - Wrapper for amdgpu_bo_sync_wait_resv
>>>> + * @bo: buffer object to wait for
>>>> + * @owner: fence owner
>>>> + * @intr: Whether the wait is interruptible
>>>> + *
>>>> + * Wrapper to wait for fences in a BO.
>>>> + * Returns:
>>>> + * 0 on success, errno otherwise.
>>>> + */
>>>> +int amdgpu_bo_sync_wait(struct amdgpu_bo *bo, void *owner, bool intr)
>>>> +{
>>>> +    struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
>>>> +
>>>> +    return amdgpu_bo_sync_wait_resv(adev, bo->tbo.base.resv,
>>>> +                    AMDGPU_SYNC_NE_OWNER, owner, intr);
>>>> +}
>>>> +
>>>>   /**
>>>>    * amdgpu_bo_gpu_offset - return GPU offset of bo
>>>>    * @bo:    amdgpu object for which we query the offset
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>>>> index 2eeafc77c9c1..96d805889e8d 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>>>> @@ -283,6 +283,9 @@ void amdgpu_bo_release_notify(struct 
>>>> ttm_buffer_object *bo);
>>>>   int amdgpu_bo_fault_reserve_notify(struct ttm_buffer_object *bo);
>>>>   void amdgpu_bo_fence(struct amdgpu_bo *bo, struct dma_fence *fence,
>>>>                bool shared);
>>>> +int amdgpu_bo_sync_wait_resv(struct amdgpu_device *adev, struct 
>>>> dma_resv *resv,
>>>> +                 enum amdgpu_sync_mode sync_mode, void *owner,
>>>> +                 bool intr);
>>>>   int amdgpu_bo_sync_wait(struct amdgpu_bo *bo, void *owner, bool 
>>>> intr);
>>>>   u64 amdgpu_bo_gpu_offset(struct amdgpu_bo *bo);
>>>>   int amdgpu_bo_validate(struct amdgpu_bo *bo);
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
>>>> index 9f42032676da..b86392253696 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
>>>> @@ -249,13 +249,6 @@ int amdgpu_sync_resv(struct amdgpu_device 
>>>> *adev, struct amdgpu_sync *sync,
>>>>               owner != AMDGPU_FENCE_OWNER_UNDEFINED)
>>>>               continue;
>>>>   -        /* VM updates only sync with moves but not with user
>>>> -         * command submissions or KFD evictions fences
>>>> -         */
>>>> -        if (fence_owner != AMDGPU_FENCE_OWNER_UNDEFINED &&
>>>> -            owner == AMDGPU_FENCE_OWNER_VM)
>>>> -            continue;
>>>> -
>>>>           /* Ignore fences depending on the sync mode */
>>>>           switch (mode) {
>>>>           case AMDGPU_SYNC_ALWAYS:
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>> index 0f79c17118bf..c268aa14381e 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>> @@ -797,7 +797,7 @@ static int amdgpu_vm_clear_bo(struct 
>>>> amdgpu_device *adev,
>>>>       params.vm = vm;
>>>>       params.direct = direct;
>>>>   -    r = vm->update_funcs->prepare(&params, 
>>>> AMDGPU_FENCE_OWNER_KFD, NULL);
>>>> +    r = vm->update_funcs->prepare(&params, NULL, 
>>>> AMDGPU_SYNC_EXPLICIT);
>>>>       if (r)
>>>>           return r;
>>>>   @@ -1293,7 +1293,7 @@ int amdgpu_vm_update_pdes(struct 
>>>> amdgpu_device *adev,
>>>>       params.vm = vm;
>>>>       params.direct = direct;
>>>>   -    r = vm->update_funcs->prepare(&params, 
>>>> AMDGPU_FENCE_OWNER_VM, NULL);
>>>> +    r = vm->update_funcs->prepare(&params, NULL, 
>>>> AMDGPU_SYNC_EXPLICIT);
>>>>       if (r)
>>>>           return r;
>>>>   @@ -1552,7 +1552,7 @@ static int amdgpu_vm_update_ptes(struct 
>>>> amdgpu_vm_update_params *params,
>>>>    * @adev: amdgpu_device pointer
>>>>    * @vm: requested vm
>>>>    * @direct: direct submission in a page fault
>>>> - * @exclusive: fence we need to sync to
>>>> + * @resv: fences we need to sync to
>>>>    * @start: start of mapped range
>>>>    * @last: last mapped entry
>>>>    * @flags: flags for the entries
>>>> @@ -1567,14 +1567,14 @@ static int amdgpu_vm_update_ptes(struct 
>>>> amdgpu_vm_update_params *params,
>>>>    */
>>>>   static int amdgpu_vm_bo_update_mapping(struct amdgpu_device *adev,
>>>>                          struct amdgpu_vm *vm, bool direct,
>>>> -                       struct dma_fence *exclusive,
>>>> +                       struct dma_resv *resv,
>>>>                          uint64_t start, uint64_t last,
>>>>                          uint64_t flags, uint64_t addr,
>>>>                          dma_addr_t *pages_addr,
>>>>                          struct dma_fence **fence)
>>>>   {
>>>>       struct amdgpu_vm_update_params params;
>>>> -    void *owner = AMDGPU_FENCE_OWNER_VM;
>>>> +    enum amdgpu_sync_mode sync_mode;
>>>>       int r;
>>>>         memset(&params, 0, sizeof(params));
>>>> @@ -1583,9 +1583,13 @@ static int 
>>>> amdgpu_vm_bo_update_mapping(struct amdgpu_device *adev,
>>>>       params.direct = direct;
>>>>       params.pages_addr = pages_addr;
>>>>   -    /* sync to everything except eviction fences on unmapping */
>>>> +    /* Implicitly sync to command submissions in the same VM before
>>>> +     * unmapping. Sync to moving fences before mapping.
>>>> +     */
>>>>       if (!(flags & AMDGPU_PTE_VALID))
>>>> -        owner = AMDGPU_FENCE_OWNER_KFD;
>>>> +        sync_mode = AMDGPU_SYNC_EQ_OWNER;
>>>> +    else
>>>> +        sync_mode = AMDGPU_SYNC_EXPLICIT;
>>>>         amdgpu_vm_eviction_lock(vm);
>>>>       if (vm->evicting) {
>>>> @@ -1593,7 +1597,7 @@ static int amdgpu_vm_bo_update_mapping(struct 
>>>> amdgpu_device *adev,
>>>>           goto error_unlock;
>>>>       }
>>>>   -    r = vm->update_funcs->prepare(&params, owner, exclusive);
>>>> +    r = vm->update_funcs->prepare(&params, resv, sync_mode);
>>>>       if (r)
>>>>           goto error_unlock;
>>>>   @@ -1612,7 +1616,7 @@ static int 
>>>> amdgpu_vm_bo_update_mapping(struct amdgpu_device *adev,
>>>>    * amdgpu_vm_bo_split_mapping - split a mapping into smaller chunks
>>>>    *
>>>>    * @adev: amdgpu_device pointer
>>>> - * @exclusive: fence we need to sync to
>>>> + * @resv: fences we need to sync to
>>>>    * @pages_addr: DMA addresses to use for mapping
>>>>    * @vm: requested vm
>>>>    * @mapping: mapped range and flags to use for the update
>>>> @@ -1628,7 +1632,7 @@ static int amdgpu_vm_bo_update_mapping(struct 
>>>> amdgpu_device *adev,
>>>>    * 0 for success, -EINVAL for failure.
>>>>    */
>>>>   static int amdgpu_vm_bo_split_mapping(struct amdgpu_device *adev,
>>>> -                      struct dma_fence *exclusive,
>>>> +                      struct dma_resv *resv,
>>>>                         dma_addr_t *pages_addr,
>>>>                         struct amdgpu_vm *vm,
>>>>                         struct amdgpu_bo_va_mapping *mapping,
>>>> @@ -1704,7 +1708,7 @@ static int amdgpu_vm_bo_split_mapping(struct 
>>>> amdgpu_device *adev,
>>>>           }
>>>>             last = min((uint64_t)mapping->last, start + max_entries 
>>>> - 1);
>>>> -        r = amdgpu_vm_bo_update_mapping(adev, vm, false, exclusive,
>>>> +        r = amdgpu_vm_bo_update_mapping(adev, vm, false, resv,
>>>>                           start, last, flags, addr,
>>>>                           dma_addr, fence);
>>>>           if (r)
>>>> @@ -1743,7 +1747,8 @@ int amdgpu_vm_bo_update(struct amdgpu_device 
>>>> *adev, struct amdgpu_bo_va *bo_va,
>>>>       dma_addr_t *pages_addr = NULL;
>>>>       struct ttm_mem_reg *mem;
>>>>       struct drm_mm_node *nodes;
>>>> -    struct dma_fence *exclusive, **last_update;
>>>> +    struct dma_fence **last_update;
>>>> +    struct dma_resv *resv;
>>>>       uint64_t flags;
>>>>       struct amdgpu_device *bo_adev = adev;
>>>>       int r;
>>>> @@ -1751,7 +1756,7 @@ int amdgpu_vm_bo_update(struct amdgpu_device 
>>>> *adev, struct amdgpu_bo_va *bo_va,
>>>>       if (clear || !bo) {
>>>>           mem = NULL;
>>>>           nodes = NULL;
>>>> -        exclusive = NULL;
>>>> +        resv = vm->root.base.bo->tbo.base.resv;
>>>>       } else {
>>>>           struct ttm_dma_tt *ttm;
>>>>   @@ -1761,7 +1766,7 @@ int amdgpu_vm_bo_update(struct 
>>>> amdgpu_device *adev, struct amdgpu_bo_va *bo_va,
>>>>               ttm = container_of(bo->tbo.ttm, struct ttm_dma_tt, ttm);
>>>>               pages_addr = ttm->dma_address;
>>>>           }
>>>> -        exclusive = bo->tbo.moving;
>>>> +        resv = bo->tbo.base.resv;
>>>>       }
>>>>         if (bo) {
>>>> @@ -1775,7 +1780,8 @@ int amdgpu_vm_bo_update(struct amdgpu_device 
>>>> *adev, struct amdgpu_bo_va *bo_va,
>>>>           flags = 0x0;
>>>>       }
>>>>   -    if (clear || (bo && bo->tbo.base.resv == 
>>>> vm->root.base.bo->tbo.base.resv))
>>>> +    if (clear || (bo && bo->tbo.base.resv ==
>>>> +              vm->root.base.bo->tbo.base.resv))
>>>>           last_update = &vm->last_update;
>>>>       else
>>>>           last_update = &bo_va->last_pt_update;
>>>> @@ -1789,7 +1795,7 @@ int amdgpu_vm_bo_update(struct amdgpu_device 
>>>> *adev, struct amdgpu_bo_va *bo_va,
>>>>       }
>>>>         list_for_each_entry(mapping, &bo_va->invalids, list) {
>>>> -        r = amdgpu_vm_bo_split_mapping(adev, exclusive, 
>>>> pages_addr, vm,
>>>> +        r = amdgpu_vm_bo_split_mapping(adev, resv, pages_addr, vm,
>>>>                              mapping, flags, bo_adev, nodes,
>>>>                              last_update);
>>>>           if (r)
>>>> @@ -1984,6 +1990,7 @@ int amdgpu_vm_clear_freed(struct 
>>>> amdgpu_device *adev,
>>>>                 struct amdgpu_vm *vm,
>>>>                 struct dma_fence **fence)
>>>>   {
>>>> +    struct dma_resv *resv = vm->root.base.bo->tbo.base.resv;
>>>>       struct amdgpu_bo_va_mapping *mapping;
>>>>       uint64_t init_pte_value = 0;
>>>>       struct dma_fence *f = NULL;
>>>> @@ -1998,7 +2005,7 @@ int amdgpu_vm_clear_freed(struct 
>>>> amdgpu_device *adev,
>>>>               mapping->start < AMDGPU_GMC_HOLE_START)
>>>>               init_pte_value = AMDGPU_PTE_DEFAULT_ATC;
>>>>   -        r = amdgpu_vm_bo_update_mapping(adev, vm, false, NULL,
>>>> +        r = amdgpu_vm_bo_update_mapping(adev, vm, false, resv,
>>>>                           mapping->start, mapping->last,
>>>>                           init_pte_value, 0, NULL, &f);
>>>>           amdgpu_vm_free_mapping(adev, vm, mapping, f);
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>> index c21a36bebc0c..b5705fcfc935 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>> @@ -230,8 +230,8 @@ struct amdgpu_vm_update_params {
>>>>     struct amdgpu_vm_update_funcs {
>>>>       int (*map_table)(struct amdgpu_bo *bo);
>>>> -    int (*prepare)(struct amdgpu_vm_update_params *p, void * owner,
>>>> -               struct dma_fence *exclusive);
>>>> +    int (*prepare)(struct amdgpu_vm_update_params *p, struct 
>>>> dma_resv *resv,
>>>> +               enum amdgpu_sync_mode sync_mode);
>>>>       int (*update)(struct amdgpu_vm_update_params *p,
>>>>                 struct amdgpu_bo *bo, uint64_t pe, uint64_t addr,
>>>>                 unsigned count, uint32_t incr, uint64_t flags);
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c
>>>> index 68b013be3837..e38516304070 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c
>>>> @@ -44,26 +44,14 @@ static int amdgpu_vm_cpu_map_table(struct 
>>>> amdgpu_bo *table)
>>>>    * Returns:
>>>>    * Negativ errno, 0 for success.
>>>>    */
>>>> -static int amdgpu_vm_cpu_prepare(struct amdgpu_vm_update_params 
>>>> *p, void *owner,
>>>> -                 struct dma_fence *exclusive)
>>>> +static int amdgpu_vm_cpu_prepare(struct amdgpu_vm_update_params *p,
>>>> +                 struct dma_resv *resv,
>>>> +                 enum amdgpu_sync_mode sync_mode)
>>>>   {
>>>> -    int r;
>>>> -
>>>> -    /* Wait for any BO move to be completed */
>>>> -    if (exclusive) {
>>>> -        r = dma_fence_wait(exclusive, true);
>>>> -        if (unlikely(r))
>>>> -            return r;
>>>> -    }
>>>> -
>>>> -    /* Don't wait for submissions during page fault */
>>>> -    if (p->direct)
>>>> +    if (!resv)
>>>>           return 0;
>>>>   -    /* Wait for PT BOs to be idle. PTs share the same resv. object
>>>> -     * as the root PD BO
>>>> -     */
>>>> -    return amdgpu_bo_sync_wait(p->vm->root.base.bo, owner, true);
>>>> +    return amdgpu_bo_sync_wait_resv(p->adev, resv, sync_mode, 
>>>> p->vm, true);
>>>>   }
>>>>     /**
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
>>>> index ab6481751763..4cc7881f438c 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
>>>> @@ -58,9 +58,9 @@ static int amdgpu_vm_sdma_map_table(struct 
>>>> amdgpu_bo *table)
>>>>    * Negativ errno, 0 for success.
>>>>    */
>>>>   static int amdgpu_vm_sdma_prepare(struct amdgpu_vm_update_params *p,
>>>> -                  void *owner, struct dma_fence *exclusive)
>>>> +                  struct dma_resv *resv,
>>>> +                  enum amdgpu_sync_mode sync_mode)
>>>>   {
>>>> -    struct amdgpu_bo *root = p->vm->root.base.bo;
>>>>       unsigned int ndw = AMDGPU_VM_SDMA_MIN_NUM_DW;
>>>>       int r;
>>>>   @@ -70,17 +70,10 @@ static int amdgpu_vm_sdma_prepare(struct 
>>>> amdgpu_vm_update_params *p,
>>>>         p->num_dw_left = ndw;
>>>>   -    /* Wait for moves to be completed */
>>>> -    r = amdgpu_sync_fence(&p->job->sync, exclusive, false);
>>>> -    if (r)
>>>> -        return r;
>>>> -
>>>> -    /* Don't wait for any submissions during page fault handling */
>>>> -    if (p->direct)
>>>> +    if (!resv)
>>>>           return 0;
>>>>   -    return amdgpu_sync_resv(p->adev, &p->job->sync, 
>>>> root->tbo.base.resv,
>>>> -                AMDGPU_SYNC_NE_OWNER, owner);
>>>> +    return amdgpu_sync_resv(p->adev, &p->job->sync, resv, 
>>>> sync_mode, p->vm);
>>>>   }
>>>>     /**
>>