[PATCH] drm/amdgpu: once more fix the call oder in amdgpu_ttm_move()

Christian König ckoenig.leichtzumerken at gmail.com
Mon Apr 22 15:05:34 UTC 2024


Am 18.04.24 um 18:10 schrieb Alex Deucher:
> On Thu, Mar 21, 2024 at 10:37 AM Christian König
> <ckoenig.leichtzumerken at gmail.com> wrote:
>> Am 21.03.24 um 15:12 schrieb Tvrtko Ursulin:
>>> On 21/03/2024 12:43, Christian König wrote:
>>>> This reverts drm/amdgpu: fix ftrace event amdgpu_bo_move always move
>>>> on same heap. The basic problem here is that after the move the old
>>>> location is simply not available any more.
>>>>
>>>> Some fixes where suggested, but essentially we should call the move
>>>> notification before actually moving things because only this way we have
>>>> the correct order for DMA-buf and VM move notifications as well.
>>>>
>>>> Also rework the statistic handling so that we don't update the eviction
>>>> counter before the move.
>>>>
>>>> Signed-off-by: Christian König <christian.koenig at amd.com>
>>> Don't forget:
>>>
>>> Fixes: 94aeb4117343 ("drm/amdgpu: fix ftrace event amdgpu_bo_move
>>> always move on same heap")
>>> Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3171
>> Ah, thanks. I already wanted to ask if there is any bug report about
>> that as well.
> Did this ever land?  I don't see it anywhere.

No, I never found time to actually rebase and push it.

Just did so 10 minutes ago, should probably show up in 
amd-staging-drm-next unless there isn't any CI hickup again.

Christian.

>
> Alex
>
>> Regards,
>> Christian.
>>
>>> ;)
>>>
>>> Regards,
>>>
>>> Tvrtko
>>>
>>>> ---
>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 15 +++----
>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_object.h |  4 +-
>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c    | 48 ++++++++++++----------
>>>>    3 files changed, 37 insertions(+), 30 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>>> index 425cebcc5cbf..eb7d824763b9 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>>> @@ -1245,19 +1245,20 @@ int amdgpu_bo_get_metadata(struct amdgpu_bo
>>>> *bo, void *buffer,
>>>>     * amdgpu_bo_move_notify - notification about a memory move
>>>>     * @bo: pointer to a buffer object
>>>>     * @evict: if this move is evicting the buffer from the graphics
>>>> address space
>>>> + * @new_mem: new resource for backing the BO
>>>>     *
>>>>     * Marks the corresponding &amdgpu_bo buffer object as invalid,
>>>> also performs
>>>>     * bookkeeping.
>>>>     * TTM driver callback which is called when ttm moves a buffer.
>>>>     */
>>>> -void amdgpu_bo_move_notify(struct ttm_buffer_object *bo, bool evict)
>>>> +void amdgpu_bo_move_notify(struct ttm_buffer_object *bo,
>>>> +               bool evict,
>>>> +               struct ttm_resource *new_mem)
>>>>    {
>>>>        struct amdgpu_device *adev = amdgpu_ttm_adev(bo->bdev);
>>>> +    struct ttm_resource *old_mem = bo->resource;
>>>>        struct amdgpu_bo *abo;
>>>>    -    if (!amdgpu_bo_is_amdgpu_bo(bo))
>>>> -        return;
>>>> -
>>>>        abo = ttm_to_amdgpu_bo(bo);
>>>>        amdgpu_vm_bo_invalidate(adev, abo, evict);
>>>>    @@ -1267,9 +1268,9 @@ void amdgpu_bo_move_notify(struct
>>>> ttm_buffer_object *bo, bool evict)
>>>>            bo->resource->mem_type != TTM_PL_SYSTEM)
>>>>            dma_buf_move_notify(abo->tbo.base.dma_buf);
>>>>    -    /* remember the eviction */
>>>> -    if (evict)
>>>> -        atomic64_inc(&adev->num_evictions);
>>>> +    /* move_notify is called before move happens */
>>>> +    trace_amdgpu_bo_move(abo, new_mem ? new_mem->mem_type : -1,
>>>> +                 old_mem ? old_mem->mem_type : -1);
>>>>    }
>>>>      void amdgpu_bo_get_memory(struct amdgpu_bo *bo,
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>>>> index a3ea8a82db23..d28e21baef16 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>>>> @@ -344,7 +344,9 @@ int amdgpu_bo_set_metadata (struct amdgpu_bo *bo,
>>>> void *metadata,
>>>>    int amdgpu_bo_get_metadata(struct amdgpu_bo *bo, void *buffer,
>>>>                   size_t buffer_size, uint32_t *metadata_size,
>>>>                   uint64_t *flags);
>>>> -void amdgpu_bo_move_notify(struct ttm_buffer_object *bo, bool evict);
>>>> +void amdgpu_bo_move_notify(struct ttm_buffer_object *bo,
>>>> +               bool evict,
>>>> +               struct ttm_resource *new_mem);
>>>>    void amdgpu_bo_release_notify(struct ttm_buffer_object *bo);
>>>>    vm_fault_t amdgpu_bo_fault_reserve_notify(struct ttm_buffer_object
>>>> *bo);
>>>>    void amdgpu_bo_fence(struct amdgpu_bo *bo, struct dma_fence *fence,
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>>>> index a5ceec7820cf..460b23918bfc 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>>>> @@ -471,14 +471,16 @@ static int amdgpu_bo_move(struct
>>>> ttm_buffer_object *bo, bool evict,
>>>>          if (!old_mem || (old_mem->mem_type == TTM_PL_SYSTEM &&
>>>>                 bo->ttm == NULL)) {
>>>> +        amdgpu_bo_move_notify(bo, evict, new_mem);
>>>>            ttm_bo_move_null(bo, new_mem);
>>>> -        goto out;
>>>> +        return 0;
>>>>        }
>>>>        if (old_mem->mem_type == TTM_PL_SYSTEM &&
>>>>            (new_mem->mem_type == TTM_PL_TT ||
>>>>             new_mem->mem_type == AMDGPU_PL_PREEMPT)) {
>>>> +        amdgpu_bo_move_notify(bo, evict, new_mem);
>>>>            ttm_bo_move_null(bo, new_mem);
>>>> -        goto out;
>>>> +        return 0;
>>>>        }
>>>>        if ((old_mem->mem_type == TTM_PL_TT ||
>>>>             old_mem->mem_type == AMDGPU_PL_PREEMPT) &&
>>>> @@ -488,9 +490,10 @@ static int amdgpu_bo_move(struct
>>>> ttm_buffer_object *bo, bool evict,
>>>>                return r;
>>>>              amdgpu_ttm_backend_unbind(bo->bdev, bo->ttm);
>>>> +        amdgpu_bo_move_notify(bo, evict, new_mem);
>>>>            ttm_resource_free(bo, &bo->resource);
>>>>            ttm_bo_assign_mem(bo, new_mem);
>>>> -        goto out;
>>>> +        return 0;
>>>>        }
>>>>          if (old_mem->mem_type == AMDGPU_PL_GDS ||
>>>> @@ -502,8 +505,9 @@ static int amdgpu_bo_move(struct
>>>> ttm_buffer_object *bo, bool evict,
>>>>            new_mem->mem_type == AMDGPU_PL_OA ||
>>>>            new_mem->mem_type == AMDGPU_PL_DOORBELL) {
>>>>            /* Nothing to save here */
>>>> +        amdgpu_bo_move_notify(bo, evict, new_mem);
>>>>            ttm_bo_move_null(bo, new_mem);
>>>> -        goto out;
>>>> +        return 0;
>>>>        }
>>>>          if (bo->type == ttm_bo_type_device &&
>>>> @@ -515,22 +519,23 @@ static int amdgpu_bo_move(struct
>>>> ttm_buffer_object *bo, bool evict,
>>>>            abo->flags &= ~AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED;
>>>>        }
>>>>    -    if (adev->mman.buffer_funcs_enabled) {
>>>> -        if (((old_mem->mem_type == TTM_PL_SYSTEM &&
>>>> -              new_mem->mem_type == TTM_PL_VRAM) ||
>>>> -             (old_mem->mem_type == TTM_PL_VRAM &&
>>>> -              new_mem->mem_type == TTM_PL_SYSTEM))) {
>>>> -            hop->fpfn = 0;
>>>> -            hop->lpfn = 0;
>>>> -            hop->mem_type = TTM_PL_TT;
>>>> -            hop->flags = TTM_PL_FLAG_TEMPORARY;
>>>> -            return -EMULTIHOP;
>>>> -        }
>>>> +    if (adev->mman.buffer_funcs_enabled &&
>>>> +        ((old_mem->mem_type == TTM_PL_SYSTEM &&
>>>> +          new_mem->mem_type == TTM_PL_VRAM) ||
>>>> +         (old_mem->mem_type == TTM_PL_VRAM &&
>>>> +          new_mem->mem_type == TTM_PL_SYSTEM))) {
>>>> +        hop->fpfn = 0;
>>>> +        hop->lpfn = 0;
>>>> +        hop->mem_type = TTM_PL_TT;
>>>> +        hop->flags = TTM_PL_FLAG_TEMPORARY;
>>>> +        return -EMULTIHOP;
>>>> +    }
>>>>    +    amdgpu_bo_move_notify(bo, evict, new_mem);
>>>> +    if (adev->mman.buffer_funcs_enabled)
>>>>            r = amdgpu_move_blit(bo, evict, new_mem, old_mem);
>>>> -    } else {
>>>> +    else
>>>>            r = -ENODEV;
>>>> -    }
>>>>          if (r) {
>>>>            /* Check that all memory is CPU accessible */
>>>> @@ -545,11 +550,10 @@ static int amdgpu_bo_move(struct
>>>> ttm_buffer_object *bo, bool evict,
>>>>                return r;
>>>>        }
>>>>    -    trace_amdgpu_bo_move(abo, new_mem->mem_type, old_mem->mem_type);
>>>> -out:
>>>> -    /* update statistics */
>>>> +    /* update statistics after the move */
>>>> +    if (evict)
>>>> +        atomic64_inc(&adev->num_evictions);
>>>>        atomic64_add(bo->base.size, &adev->num_bytes_moved);
>>>> -    amdgpu_bo_move_notify(bo, evict);
>>>>        return 0;
>>>>    }
>>>>    @@ -1551,7 +1555,7 @@ static int amdgpu_ttm_access_memory(struct
>>>> ttm_buffer_object *bo,
>>>>    static void
>>>>    amdgpu_bo_delete_mem_notify(struct ttm_buffer_object *bo)
>>>>    {
>>>> -    amdgpu_bo_move_notify(bo, false);
>>>> +    amdgpu_bo_move_notify(bo, false, NULL);
>>>>    }
>>>>      static struct ttm_device_funcs amdgpu_bo_driver = {



More information about the amd-gfx mailing list