[PATCH] drm/amdgpu: once more fix the call oder in amdgpu_ttm_move()

Alex Deucher alexdeucher at gmail.com
Thu Apr 18 16:10:13 UTC 2024


On Thu, Mar 21, 2024 at 10:37 AM Christian König
<ckoenig.leichtzumerken at gmail.com> wrote:
>
> Am 21.03.24 um 15:12 schrieb Tvrtko Ursulin:
> >
> > On 21/03/2024 12:43, Christian König wrote:
> >> This reverts drm/amdgpu: fix ftrace event amdgpu_bo_move always move
> >> on same heap. The basic problem here is that after the move the old
> >> location is simply not available any more.
> >>
> >> Some fixes where suggested, but essentially we should call the move
> >> notification before actually moving things because only this way we have
> >> the correct order for DMA-buf and VM move notifications as well.
> >>
> >> Also rework the statistic handling so that we don't update the eviction
> >> counter before the move.
> >>
> >> Signed-off-by: Christian König <christian.koenig at amd.com>
> >
> > Don't forget:
> >
> > Fixes: 94aeb4117343 ("drm/amdgpu: fix ftrace event amdgpu_bo_move
> > always move on same heap")
> > Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3171
>
> Ah, thanks. I already wanted to ask if there is any bug report about
> that as well.

Did this ever land?  I don't see it anywhere.

Alex

>
> Regards,
> Christian.
>
> >
> > ;)
> >
> > Regards,
> >
> > Tvrtko
> >
> >> ---
> >>   drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 15 +++----
> >>   drivers/gpu/drm/amd/amdgpu/amdgpu_object.h |  4 +-
> >>   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c    | 48 ++++++++++++----------
> >>   3 files changed, 37 insertions(+), 30 deletions(-)
> >>
> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> >> index 425cebcc5cbf..eb7d824763b9 100644
> >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> >> @@ -1245,19 +1245,20 @@ int amdgpu_bo_get_metadata(struct amdgpu_bo
> >> *bo, void *buffer,
> >>    * amdgpu_bo_move_notify - notification about a memory move
> >>    * @bo: pointer to a buffer object
> >>    * @evict: if this move is evicting the buffer from the graphics
> >> address space
> >> + * @new_mem: new resource for backing the BO
> >>    *
> >>    * Marks the corresponding &amdgpu_bo buffer object as invalid,
> >> also performs
> >>    * bookkeeping.
> >>    * TTM driver callback which is called when ttm moves a buffer.
> >>    */
> >> -void amdgpu_bo_move_notify(struct ttm_buffer_object *bo, bool evict)
> >> +void amdgpu_bo_move_notify(struct ttm_buffer_object *bo,
> >> +               bool evict,
> >> +               struct ttm_resource *new_mem)
> >>   {
> >>       struct amdgpu_device *adev = amdgpu_ttm_adev(bo->bdev);
> >> +    struct ttm_resource *old_mem = bo->resource;
> >>       struct amdgpu_bo *abo;
> >>   -    if (!amdgpu_bo_is_amdgpu_bo(bo))
> >> -        return;
> >> -
> >>       abo = ttm_to_amdgpu_bo(bo);
> >>       amdgpu_vm_bo_invalidate(adev, abo, evict);
> >>   @@ -1267,9 +1268,9 @@ void amdgpu_bo_move_notify(struct
> >> ttm_buffer_object *bo, bool evict)
> >>           bo->resource->mem_type != TTM_PL_SYSTEM)
> >>           dma_buf_move_notify(abo->tbo.base.dma_buf);
> >>   -    /* remember the eviction */
> >> -    if (evict)
> >> -        atomic64_inc(&adev->num_evictions);
> >> +    /* move_notify is called before move happens */
> >> +    trace_amdgpu_bo_move(abo, new_mem ? new_mem->mem_type : -1,
> >> +                 old_mem ? old_mem->mem_type : -1);
> >>   }
> >>     void amdgpu_bo_get_memory(struct amdgpu_bo *bo,
> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> >> index a3ea8a82db23..d28e21baef16 100644
> >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> >> @@ -344,7 +344,9 @@ int amdgpu_bo_set_metadata (struct amdgpu_bo *bo,
> >> void *metadata,
> >>   int amdgpu_bo_get_metadata(struct amdgpu_bo *bo, void *buffer,
> >>                  size_t buffer_size, uint32_t *metadata_size,
> >>                  uint64_t *flags);
> >> -void amdgpu_bo_move_notify(struct ttm_buffer_object *bo, bool evict);
> >> +void amdgpu_bo_move_notify(struct ttm_buffer_object *bo,
> >> +               bool evict,
> >> +               struct ttm_resource *new_mem);
> >>   void amdgpu_bo_release_notify(struct ttm_buffer_object *bo);
> >>   vm_fault_t amdgpu_bo_fault_reserve_notify(struct ttm_buffer_object
> >> *bo);
> >>   void amdgpu_bo_fence(struct amdgpu_bo *bo, struct dma_fence *fence,
> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> >> index a5ceec7820cf..460b23918bfc 100644
> >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> >> @@ -471,14 +471,16 @@ static int amdgpu_bo_move(struct
> >> ttm_buffer_object *bo, bool evict,
> >>         if (!old_mem || (old_mem->mem_type == TTM_PL_SYSTEM &&
> >>                bo->ttm == NULL)) {
> >> +        amdgpu_bo_move_notify(bo, evict, new_mem);
> >>           ttm_bo_move_null(bo, new_mem);
> >> -        goto out;
> >> +        return 0;
> >>       }
> >>       if (old_mem->mem_type == TTM_PL_SYSTEM &&
> >>           (new_mem->mem_type == TTM_PL_TT ||
> >>            new_mem->mem_type == AMDGPU_PL_PREEMPT)) {
> >> +        amdgpu_bo_move_notify(bo, evict, new_mem);
> >>           ttm_bo_move_null(bo, new_mem);
> >> -        goto out;
> >> +        return 0;
> >>       }
> >>       if ((old_mem->mem_type == TTM_PL_TT ||
> >>            old_mem->mem_type == AMDGPU_PL_PREEMPT) &&
> >> @@ -488,9 +490,10 @@ static int amdgpu_bo_move(struct
> >> ttm_buffer_object *bo, bool evict,
> >>               return r;
> >>             amdgpu_ttm_backend_unbind(bo->bdev, bo->ttm);
> >> +        amdgpu_bo_move_notify(bo, evict, new_mem);
> >>           ttm_resource_free(bo, &bo->resource);
> >>           ttm_bo_assign_mem(bo, new_mem);
> >> -        goto out;
> >> +        return 0;
> >>       }
> >>         if (old_mem->mem_type == AMDGPU_PL_GDS ||
> >> @@ -502,8 +505,9 @@ static int amdgpu_bo_move(struct
> >> ttm_buffer_object *bo, bool evict,
> >>           new_mem->mem_type == AMDGPU_PL_OA ||
> >>           new_mem->mem_type == AMDGPU_PL_DOORBELL) {
> >>           /* Nothing to save here */
> >> +        amdgpu_bo_move_notify(bo, evict, new_mem);
> >>           ttm_bo_move_null(bo, new_mem);
> >> -        goto out;
> >> +        return 0;
> >>       }
> >>         if (bo->type == ttm_bo_type_device &&
> >> @@ -515,22 +519,23 @@ static int amdgpu_bo_move(struct
> >> ttm_buffer_object *bo, bool evict,
> >>           abo->flags &= ~AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED;
> >>       }
> >>   -    if (adev->mman.buffer_funcs_enabled) {
> >> -        if (((old_mem->mem_type == TTM_PL_SYSTEM &&
> >> -              new_mem->mem_type == TTM_PL_VRAM) ||
> >> -             (old_mem->mem_type == TTM_PL_VRAM &&
> >> -              new_mem->mem_type == TTM_PL_SYSTEM))) {
> >> -            hop->fpfn = 0;
> >> -            hop->lpfn = 0;
> >> -            hop->mem_type = TTM_PL_TT;
> >> -            hop->flags = TTM_PL_FLAG_TEMPORARY;
> >> -            return -EMULTIHOP;
> >> -        }
> >> +    if (adev->mman.buffer_funcs_enabled &&
> >> +        ((old_mem->mem_type == TTM_PL_SYSTEM &&
> >> +          new_mem->mem_type == TTM_PL_VRAM) ||
> >> +         (old_mem->mem_type == TTM_PL_VRAM &&
> >> +          new_mem->mem_type == TTM_PL_SYSTEM))) {
> >> +        hop->fpfn = 0;
> >> +        hop->lpfn = 0;
> >> +        hop->mem_type = TTM_PL_TT;
> >> +        hop->flags = TTM_PL_FLAG_TEMPORARY;
> >> +        return -EMULTIHOP;
> >> +    }
> >>   +    amdgpu_bo_move_notify(bo, evict, new_mem);
> >> +    if (adev->mman.buffer_funcs_enabled)
> >>           r = amdgpu_move_blit(bo, evict, new_mem, old_mem);
> >> -    } else {
> >> +    else
> >>           r = -ENODEV;
> >> -    }
> >>         if (r) {
> >>           /* Check that all memory is CPU accessible */
> >> @@ -545,11 +550,10 @@ static int amdgpu_bo_move(struct
> >> ttm_buffer_object *bo, bool evict,
> >>               return r;
> >>       }
> >>   -    trace_amdgpu_bo_move(abo, new_mem->mem_type, old_mem->mem_type);
> >> -out:
> >> -    /* update statistics */
> >> +    /* update statistics after the move */
> >> +    if (evict)
> >> +        atomic64_inc(&adev->num_evictions);
> >>       atomic64_add(bo->base.size, &adev->num_bytes_moved);
> >> -    amdgpu_bo_move_notify(bo, evict);
> >>       return 0;
> >>   }
> >>   @@ -1551,7 +1555,7 @@ static int amdgpu_ttm_access_memory(struct
> >> ttm_buffer_object *bo,
> >>   static void
> >>   amdgpu_bo_delete_mem_notify(struct ttm_buffer_object *bo)
> >>   {
> >> -    amdgpu_bo_move_notify(bo, false);
> >> +    amdgpu_bo_move_notify(bo, false, NULL);
> >>   }
> >>     static struct ttm_device_funcs amdgpu_bo_driver = {
>


More information about the amd-gfx mailing list