[PATCH] drm/amdgpu: once more fix the call oder in amdgpu_ttm_move()
Christian König
ckoenig.leichtzumerken at gmail.com
Thu Mar 21 14:37:27 UTC 2024
Am 21.03.24 um 15:12 schrieb Tvrtko Ursulin:
>
> On 21/03/2024 12:43, Christian König wrote:
>> This reverts drm/amdgpu: fix ftrace event amdgpu_bo_move always move
>> on same heap. The basic problem here is that after the move the old
>> location is simply not available any more.
>>
>> Some fixes where suggested, but essentially we should call the move
>> notification before actually moving things because only this way we have
>> the correct order for DMA-buf and VM move notifications as well.
>>
>> Also rework the statistic handling so that we don't update the eviction
>> counter before the move.
>>
>> Signed-off-by: Christian König <christian.koenig at amd.com>
>
> Don't forget:
>
> Fixes: 94aeb4117343 ("drm/amdgpu: fix ftrace event amdgpu_bo_move
> always move on same heap")
> Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3171
Ah, thanks. I already wanted to ask if there is any bug report about
that as well.
Regards,
Christian.
>
> ;)
>
> Regards,
>
> Tvrtko
>
>> ---
>> drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 15 +++----
>> drivers/gpu/drm/amd/amdgpu/amdgpu_object.h | 4 +-
>> drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 48 ++++++++++++----------
>> 3 files changed, 37 insertions(+), 30 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>> index 425cebcc5cbf..eb7d824763b9 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>> @@ -1245,19 +1245,20 @@ int amdgpu_bo_get_metadata(struct amdgpu_bo
>> *bo, void *buffer,
>> * amdgpu_bo_move_notify - notification about a memory move
>> * @bo: pointer to a buffer object
>> * @evict: if this move is evicting the buffer from the graphics
>> address space
>> + * @new_mem: new resource for backing the BO
>> *
>> * Marks the corresponding &amdgpu_bo buffer object as invalid,
>> also performs
>> * bookkeeping.
>> * TTM driver callback which is called when ttm moves a buffer.
>> */
>> -void amdgpu_bo_move_notify(struct ttm_buffer_object *bo, bool evict)
>> +void amdgpu_bo_move_notify(struct ttm_buffer_object *bo,
>> + bool evict,
>> + struct ttm_resource *new_mem)
>> {
>> struct amdgpu_device *adev = amdgpu_ttm_adev(bo->bdev);
>> + struct ttm_resource *old_mem = bo->resource;
>> struct amdgpu_bo *abo;
>> - if (!amdgpu_bo_is_amdgpu_bo(bo))
>> - return;
>> -
>> abo = ttm_to_amdgpu_bo(bo);
>> amdgpu_vm_bo_invalidate(adev, abo, evict);
>> @@ -1267,9 +1268,9 @@ void amdgpu_bo_move_notify(struct
>> ttm_buffer_object *bo, bool evict)
>> bo->resource->mem_type != TTM_PL_SYSTEM)
>> dma_buf_move_notify(abo->tbo.base.dma_buf);
>> - /* remember the eviction */
>> - if (evict)
>> - atomic64_inc(&adev->num_evictions);
>> + /* move_notify is called before move happens */
>> + trace_amdgpu_bo_move(abo, new_mem ? new_mem->mem_type : -1,
>> + old_mem ? old_mem->mem_type : -1);
>> }
>> void amdgpu_bo_get_memory(struct amdgpu_bo *bo,
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>> index a3ea8a82db23..d28e21baef16 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>> @@ -344,7 +344,9 @@ int amdgpu_bo_set_metadata (struct amdgpu_bo *bo,
>> void *metadata,
>> int amdgpu_bo_get_metadata(struct amdgpu_bo *bo, void *buffer,
>> size_t buffer_size, uint32_t *metadata_size,
>> uint64_t *flags);
>> -void amdgpu_bo_move_notify(struct ttm_buffer_object *bo, bool evict);
>> +void amdgpu_bo_move_notify(struct ttm_buffer_object *bo,
>> + bool evict,
>> + struct ttm_resource *new_mem);
>> void amdgpu_bo_release_notify(struct ttm_buffer_object *bo);
>> vm_fault_t amdgpu_bo_fault_reserve_notify(struct ttm_buffer_object
>> *bo);
>> void amdgpu_bo_fence(struct amdgpu_bo *bo, struct dma_fence *fence,
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>> index a5ceec7820cf..460b23918bfc 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>> @@ -471,14 +471,16 @@ static int amdgpu_bo_move(struct
>> ttm_buffer_object *bo, bool evict,
>> if (!old_mem || (old_mem->mem_type == TTM_PL_SYSTEM &&
>> bo->ttm == NULL)) {
>> + amdgpu_bo_move_notify(bo, evict, new_mem);
>> ttm_bo_move_null(bo, new_mem);
>> - goto out;
>> + return 0;
>> }
>> if (old_mem->mem_type == TTM_PL_SYSTEM &&
>> (new_mem->mem_type == TTM_PL_TT ||
>> new_mem->mem_type == AMDGPU_PL_PREEMPT)) {
>> + amdgpu_bo_move_notify(bo, evict, new_mem);
>> ttm_bo_move_null(bo, new_mem);
>> - goto out;
>> + return 0;
>> }
>> if ((old_mem->mem_type == TTM_PL_TT ||
>> old_mem->mem_type == AMDGPU_PL_PREEMPT) &&
>> @@ -488,9 +490,10 @@ static int amdgpu_bo_move(struct
>> ttm_buffer_object *bo, bool evict,
>> return r;
>> amdgpu_ttm_backend_unbind(bo->bdev, bo->ttm);
>> + amdgpu_bo_move_notify(bo, evict, new_mem);
>> ttm_resource_free(bo, &bo->resource);
>> ttm_bo_assign_mem(bo, new_mem);
>> - goto out;
>> + return 0;
>> }
>> if (old_mem->mem_type == AMDGPU_PL_GDS ||
>> @@ -502,8 +505,9 @@ static int amdgpu_bo_move(struct
>> ttm_buffer_object *bo, bool evict,
>> new_mem->mem_type == AMDGPU_PL_OA ||
>> new_mem->mem_type == AMDGPU_PL_DOORBELL) {
>> /* Nothing to save here */
>> + amdgpu_bo_move_notify(bo, evict, new_mem);
>> ttm_bo_move_null(bo, new_mem);
>> - goto out;
>> + return 0;
>> }
>> if (bo->type == ttm_bo_type_device &&
>> @@ -515,22 +519,23 @@ static int amdgpu_bo_move(struct
>> ttm_buffer_object *bo, bool evict,
>> abo->flags &= ~AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED;
>> }
>> - if (adev->mman.buffer_funcs_enabled) {
>> - if (((old_mem->mem_type == TTM_PL_SYSTEM &&
>> - new_mem->mem_type == TTM_PL_VRAM) ||
>> - (old_mem->mem_type == TTM_PL_VRAM &&
>> - new_mem->mem_type == TTM_PL_SYSTEM))) {
>> - hop->fpfn = 0;
>> - hop->lpfn = 0;
>> - hop->mem_type = TTM_PL_TT;
>> - hop->flags = TTM_PL_FLAG_TEMPORARY;
>> - return -EMULTIHOP;
>> - }
>> + if (adev->mman.buffer_funcs_enabled &&
>> + ((old_mem->mem_type == TTM_PL_SYSTEM &&
>> + new_mem->mem_type == TTM_PL_VRAM) ||
>> + (old_mem->mem_type == TTM_PL_VRAM &&
>> + new_mem->mem_type == TTM_PL_SYSTEM))) {
>> + hop->fpfn = 0;
>> + hop->lpfn = 0;
>> + hop->mem_type = TTM_PL_TT;
>> + hop->flags = TTM_PL_FLAG_TEMPORARY;
>> + return -EMULTIHOP;
>> + }
>> + amdgpu_bo_move_notify(bo, evict, new_mem);
>> + if (adev->mman.buffer_funcs_enabled)
>> r = amdgpu_move_blit(bo, evict, new_mem, old_mem);
>> - } else {
>> + else
>> r = -ENODEV;
>> - }
>> if (r) {
>> /* Check that all memory is CPU accessible */
>> @@ -545,11 +550,10 @@ static int amdgpu_bo_move(struct
>> ttm_buffer_object *bo, bool evict,
>> return r;
>> }
>> - trace_amdgpu_bo_move(abo, new_mem->mem_type, old_mem->mem_type);
>> -out:
>> - /* update statistics */
>> + /* update statistics after the move */
>> + if (evict)
>> + atomic64_inc(&adev->num_evictions);
>> atomic64_add(bo->base.size, &adev->num_bytes_moved);
>> - amdgpu_bo_move_notify(bo, evict);
>> return 0;
>> }
>> @@ -1551,7 +1555,7 @@ static int amdgpu_ttm_access_memory(struct
>> ttm_buffer_object *bo,
>> static void
>> amdgpu_bo_delete_mem_notify(struct ttm_buffer_object *bo)
>> {
>> - amdgpu_bo_move_notify(bo, false);
>> + amdgpu_bo_move_notify(bo, false, NULL);
>> }
>> static struct ttm_device_funcs amdgpu_bo_driver = {
More information about the amd-gfx
mailing list