[PATCH 2/3] drm/amdgpu: sync to KFD fences before clearing PTEs

Christian König ckoenig.leichtzumerken at gmail.com
Thu Aug 22 09:07:03 UTC 2024


Am 21.08.24 um 22:01 schrieb Felix Kuehling:
> On 2024-08-21 08:03, Christian König wrote:
>> This patch tries to solve the basic problem we also need to sync to
>> the KFD fences of the BO because otherwise it can be that we clear
>> PTEs while the KFD queues are still running.
>
> This is going to trigger a lot of phantom KFD evictions and will tank 
> performance. It's probably not what you intended.

I tried to avoid that by only waiting for the KFD fence only in the 
particular situation that we can't lock the cleared BO because of 
contention.

The only short term alternative I can see is to lock all BOs during CS 
and that is a) a really large rework and b) will most likely hurt 
performance.

Then there is the alternative to lock the VM during BO eviction, but 
that means we need to wait on using the drm_exec object inside TTM as 
well. So that won't get this fixed in the next halve year or so.

Regards,
Christian.

>
> Regards,
>   Felix
>
>
>>
>> Signed-off-by: Christian König <christian.koenig at amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c | 30 ++++++++++++++++++++++++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h |  1 +
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c   |  6 +++++
>>   3 files changed, 37 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
>> index bdf1ef825d89..c586ab4c911b 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
>> @@ -260,6 +260,36 @@ int amdgpu_sync_resv(struct amdgpu_device *adev, 
>> struct amdgpu_sync *sync,
>>       return 0;
>>   }
>>   +/**
>> + * amdgpu_sync_kfd - sync to KFD fences
>> + *
>> + * @sync: sync object to add KFD fences to
>> + * @resv: reservation object with KFD fences
>> + *
>> + * Extract all KFD fences and add them to the sync object.
>> + */
>> +int amdgpu_sync_kfd(struct amdgpu_sync *sync, struct dma_resv *resv)
>> +{
>> +    struct dma_resv_iter cursor;
>> +    struct dma_fence *f;
>> +    int r = 0;
>> +
>> +    dma_resv_iter_begin(&cursor, resv, DMA_RESV_USAGE_BOOKKEEP);
>> +    dma_resv_for_each_fence_unlocked(&cursor, f) {
>> +        void *fence_owner = amdgpu_sync_get_owner(f);
>> +
>> +        if (fence_owner != AMDGPU_FENCE_OWNER_KFD)
>> +            continue;
>> +
>> +        r = amdgpu_sync_fence(sync, f);
>> +        if (r)
>> +            break;
>> +    }
>> +    dma_resv_iter_end(&cursor);
>> +
>> +    return r;
>> +}
>> +
>>   /* Free the entry back to the slab */
>>   static void amdgpu_sync_entry_free(struct amdgpu_sync_entry *e)
>>   {
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h
>> index cf1e9e858efd..e3272dce798d 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h
>> @@ -51,6 +51,7 @@ int amdgpu_sync_fence(struct amdgpu_sync *sync, 
>> struct dma_fence *f);
>>   int amdgpu_sync_resv(struct amdgpu_device *adev, struct amdgpu_sync 
>> *sync,
>>                struct dma_resv *resv, enum amdgpu_sync_mode mode,
>>                void *owner);
>> +int amdgpu_sync_kfd(struct amdgpu_sync *sync, struct dma_resv *resv);
>>   struct dma_fence *amdgpu_sync_peek_fence(struct amdgpu_sync *sync,
>>                        struct amdgpu_ring *ring);
>>   struct dma_fence *amdgpu_sync_get_fence(struct amdgpu_sync *sync);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> index ba99d428610a..13d429b91327 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> @@ -1168,6 +1168,12 @@ int amdgpu_vm_bo_update(struct amdgpu_device 
>> *adev, struct amdgpu_bo_va *bo_va,
>>                        AMDGPU_SYNC_EQ_OWNER, vm);
>>           if (r)
>>               goto error_free;
>> +        if (bo) {
>> +            r = amdgpu_sync_kfd(&sync, bo->tbo.base.resv);
>> +            if (r)
>> +                goto error_free;
>> +        }
>> +
>>       } else {
>>           struct drm_gem_object *obj = &bo->tbo.base;



More information about the amd-gfx mailing list