[PATCH] drm/amdgpu: Add a low priority scheduler for VRAM clearing

Fri May 19 01:09:00 UTC 2023

On 5/17/2023 5:10 PM, Felix Kuehling wrote:
> Caution: This message originated from an External Source. Use proper 
> caution when opening attachments, clicking links, or responding.
>
>
> On 2023-05-17 17:40, Mukul Joshi wrote:
>> Add a low priority DRM scheduler for VRAM clearing instead of using
>> the existing high priority scheduler. Use the high priority scheduler
>> for migrations and evictions.
>>
>> Signed-off-by: Mukul Joshi <mukul.joshi at amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_object.c |  4 +--
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c    | 37 ++++++++++++++++------
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h    |  5 ++-
>>   3 files changed, 34 insertions(+), 12 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>> index 9924c8de57ab..46f249912b67 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>> @@ -627,7 +627,7 @@ int amdgpu_bo_create(struct amdgpu_device *adev,
>>           bo->tbo.resource->mem_type == TTM_PL_VRAM) {
>>               struct dma_fence *fence;
>>
>> -             r = amdgpu_fill_buffer(bo, 0, bo->tbo.base.resv, &fence);
>> +             r = amdgpu_fill_buffer(bo, 0, bo->tbo.base.resv, 
>> &fence, true);
>>               if (unlikely(r))
>>                       goto fail_unreserve;
>>
>> @@ -1354,7 +1354,7 @@ void amdgpu_bo_release_notify(struct 
>> ttm_buffer_object *bo)
>>       if (WARN_ON_ONCE(!dma_resv_trylock(bo->base.resv)))
>>               return;
>>
>> -     r = amdgpu_fill_buffer(abo, AMDGPU_POISON, bo->base.resv, &fence);
>> +     r = amdgpu_fill_buffer(abo, AMDGPU_POISON, bo->base.resv, 
>> &fence, true);
>>       if (!WARN_ON(r)) {
>>               amdgpu_bo_fence(abo, fence, false);
>>               dma_fence_put(fence);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>> index 34724b771ace..bbdad0dc1b07 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>> @@ -384,7 +384,8 @@ static int amdgpu_move_blit(struct 
>> ttm_buffer_object *bo,
>>           (abo->flags & AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE)) {
>>               struct dma_fence *wipe_fence = NULL;
>>
>> -             r = amdgpu_fill_buffer(abo, AMDGPU_POISON, NULL, 
>> &wipe_fence);
>> +             r = amdgpu_fill_buffer(abo, AMDGPU_POISON, NULL, 
>> &wipe_fence,
>> +                                     false);
>>               if (r) {
>>                       goto error;
>>               } else if (wipe_fence) {
>> @@ -2040,8 +2041,18 @@ void amdgpu_ttm_set_buffer_funcs_status(struct 
>> amdgpu_device *adev, bool enable)
>>                                 r);
>>                       return;
>>               }
>> +
>> +             r = drm_sched_entity_init(&adev->mman.delayed,
>
> I'm not a big fan of the "delayed" name. But I see the parallel with
> "immediate" and "delayed" entities in amdgpu_vm. We may want to rename
> adev->mman.entity to "immediate" as well to make the distinction clearer.
>
>
>> + DRM_SCHED_PRIORITY_NORMAL, &sched,
>
> I wonder if this could even be DRM_SCHED_PRIORITY_MIN to minimize its
> performance impact on page table updates and graphics user mode
> submissions. Either way, the patch is
>
> Acked-by: Felix Kuehling <Felix.Kuehling at amd.com>
>
> Thanks,
>   Felix
>
I think you could make it more general: not just two drm_sched_entity in 
adev->mman:

struct amdgpu_mman {
....
  struct drm_sched_entity entity[DRM_SCHED_PRIORITY_COUNT];

....

}

then call drm_sched_entity_init on each adev->mman.entity[i] in 
amdgpu_ttm_set_buffer_funcs_status. That way you can use adev->mman.entity[i] 
for different purposes, like adev->mman.entity[DRM_SCHED_PRIORITY_KERNEL] 
for svm migration, adev->mman.entity[DRM_SCHED_PRIORITY_NORMAL] for 
buffer clearing during alloc/free, 
adev->mman.entity[DRM_SCHED_PRIORITY_MIN] for page table update, etc. 
That will handle different buffer copies with different priorities, leading 
to a performance improvement.

>
>> +                                       1, NULL);
>> +             if (r) {
>> +                     DRM_ERROR("Failed setting up TTM BO move entity 
>> (%d)\n",
>> +                               r);
>> +                     goto error_free_entity;
>> +             }
>>       } else {
>> drm_sched_entity_destroy(&adev->mman.entity);
>> + drm_sched_entity_destroy(&adev->mman.delayed);
>>               dma_fence_put(man->move);
>>               man->move = NULL;
>>       }
>> @@ -2053,6 +2064,11 @@ void amdgpu_ttm_set_buffer_funcs_status(struct 
>> amdgpu_device *adev, bool enable)
>>               size = adev->gmc.visible_vram_size;
>>       man->size = size;
>>       adev->mman.buffer_funcs_enabled = enable;
>> +
>> +     return;
>> +
>> +error_free_entity:
>> +     drm_sched_entity_destroy(&adev->mman.entity);
>>   }
>>
>>   static int amdgpu_ttm_prepare_job(struct amdgpu_device *adev,
>> @@ -2060,14 +2076,16 @@ static int amdgpu_ttm_prepare_job(struct 
>> amdgpu_device *adev,
>>                                 unsigned int num_dw,
>>                                 struct dma_resv *resv,
>>                                 bool vm_needs_flush,
>> -                               struct amdgpu_job **job)
>> +                               struct amdgpu_job **job,
>> +                               bool delayed)
>>   {
>>       enum amdgpu_ib_pool_type pool = direct_submit ?
>>               AMDGPU_IB_POOL_DIRECT :
>>               AMDGPU_IB_POOL_DELAYED;
>>       int r;
>> -
>> -     r = amdgpu_job_alloc_with_ib(adev, &adev->mman.entity,
>> +     struct drm_sched_entity *entity = delayed ? &adev->mman.delayed :
>> + &adev->mman.entity;
>> +     r = amdgpu_job_alloc_with_ib(adev, entity,
>>                                    AMDGPU_FENCE_OWNER_UNDEFINED,
>>                                    num_dw * 4, pool, job);
>>       if (r)
>> @@ -2108,7 +2126,7 @@ int amdgpu_copy_buffer(struct amdgpu_ring 
>> *ring, uint64_t src_offset,
>>       num_loops = DIV_ROUND_UP(byte_count, max_bytes);
>>       num_dw = ALIGN(num_loops * 
>> adev->mman.buffer_funcs->copy_num_dw, 8);
>>       r = amdgpu_ttm_prepare_job(adev, direct_submit, num_dw,
>> -                                resv, vm_needs_flush, &job);
>> +                                resv, vm_needs_flush, &job, false);
>>       if (r)
>>               return r;
>>
>> @@ -2144,7 +2162,7 @@ static int amdgpu_ttm_fill_mem(struct 
>> amdgpu_ring *ring, uint32_t src_data,
>>                              uint64_t dst_addr, uint32_t byte_count,
>>                              struct dma_resv *resv,
>>                              struct dma_fence **fence,
>> -                            bool vm_needs_flush)
>> +                            bool vm_needs_flush, bool delayed)
>>   {
>>       struct amdgpu_device *adev = ring->adev;
>>       unsigned int num_loops, num_dw;
>> @@ -2157,7 +2175,7 @@ static int amdgpu_ttm_fill_mem(struct 
>> amdgpu_ring *ring, uint32_t src_data,
>>       num_loops = DIV_ROUND_UP_ULL(byte_count, max_bytes);
>>       num_dw = ALIGN(num_loops * 
>> adev->mman.buffer_funcs->fill_num_dw, 8);
>>       r = amdgpu_ttm_prepare_job(adev, false, num_dw, resv, 
>> vm_needs_flush,
>> -                                &job);
>> +                                &job, delayed);
>>       if (r)
>>               return r;
>>
>> @@ -2180,7 +2198,8 @@ static int amdgpu_ttm_fill_mem(struct 
>> amdgpu_ring *ring, uint32_t src_data,
>>   int amdgpu_fill_buffer(struct amdgpu_bo *bo,
>>                       uint32_t src_data,
>>                       struct dma_resv *resv,
>> -                     struct dma_fence **f)
>> +                     struct dma_fence **f,
>> +                     bool delayed)
>>   {
>>       struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
>>       struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;
>> @@ -2209,7 +2228,7 @@ int amdgpu_fill_buffer(struct amdgpu_bo *bo,
>>                       goto error;
>>
>>               r = amdgpu_ttm_fill_mem(ring, src_data, to, cur_size, 
>> resv,
>> -                                     &next, true);
>> +                                     &next, true, delayed);
>>               if (r)
>>                       goto error;
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>> index 8ef048a0a33e..e82b1edee7a4 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>> @@ -61,6 +61,8 @@ struct amdgpu_mman {
>>       struct mutex                            gtt_window_lock;
>>       /* Scheduler entity for buffer moves */
>>       struct drm_sched_entity                 entity;
>> +     /* Scheduler entity for VRAM clearing */
>> +     struct drm_sched_entity                 delayed;
>>
>>       struct amdgpu_vram_mgr vram_mgr;
>>       struct amdgpu_gtt_mgr gtt_mgr;
>> @@ -152,7 +154,8 @@ int amdgpu_ttm_copy_mem_to_mem(struct 
>> amdgpu_device *adev,
>>   int amdgpu_fill_buffer(struct amdgpu_bo *bo,
>>                       uint32_t src_data,
>>                       struct dma_resv *resv,
>> -                     struct dma_fence **fence);
>> +                     struct dma_fence **fence,
>> +                     bool delayed);
>>
>>   int amdgpu_ttm_alloc_gart(struct ttm_buffer_object *bo);
>>   void amdgpu_ttm_recover_gart(struct ttm_buffer_object *tbo);