[PATCH] drm/amdgpu: Add a low priority scheduler for VRAM clearing
Christian König
christian.koenig at amd.com
Tue May 30 18:05:31 UTC 2023
Am 19.05.23 um 03:09 schrieb Chen, Xiaogang:
>
> On 5/17/2023 5:10 PM, Felix Kuehling wrote:
>> Caution: This message originated from an External Source. Use proper
>> caution when opening attachments, clicking links, or responding.
>>
>>
>> On 2023-05-17 17:40, Mukul Joshi wrote:
>>> Add a low priority DRM scheduler for VRAM clearing instead of using
>>> the existing high priority scheduler. Use the high priority scheduler
>>> for migrations and evictions.
>>>
>>> Signed-off-by: Mukul Joshi <mukul.joshi at amd.com>
>>> ---
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 4 +--
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 37
>>> ++++++++++++++++------
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h | 5 ++-
>>> 3 files changed, 34 insertions(+), 12 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>> index 9924c8de57ab..46f249912b67 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>> @@ -627,7 +627,7 @@ int amdgpu_bo_create(struct amdgpu_device *adev,
>>> bo->tbo.resource->mem_type == TTM_PL_VRAM) {
>>> struct dma_fence *fence;
>>>
>>> - r = amdgpu_fill_buffer(bo, 0, bo->tbo.base.resv, &fence);
>>> + r = amdgpu_fill_buffer(bo, 0, bo->tbo.base.resv,
>>> &fence, true);
>>> if (unlikely(r))
>>> goto fail_unreserve;
>>>
>>> @@ -1354,7 +1354,7 @@ void amdgpu_bo_release_notify(struct
>>> ttm_buffer_object *bo)
>>> if (WARN_ON_ONCE(!dma_resv_trylock(bo->base.resv)))
>>> return;
>>>
>>> - r = amdgpu_fill_buffer(abo, AMDGPU_POISON, bo->base.resv,
>>> &fence);
>>> + r = amdgpu_fill_buffer(abo, AMDGPU_POISON, bo->base.resv,
>>> &fence, true);
>>> if (!WARN_ON(r)) {
>>> amdgpu_bo_fence(abo, fence, false);
>>> dma_fence_put(fence);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>>> index 34724b771ace..bbdad0dc1b07 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>>> @@ -384,7 +384,8 @@ static int amdgpu_move_blit(struct
>>> ttm_buffer_object *bo,
>>> (abo->flags & AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE)) {
>>> struct dma_fence *wipe_fence = NULL;
>>>
>>> - r = amdgpu_fill_buffer(abo, AMDGPU_POISON, NULL,
>>> &wipe_fence);
>>> + r = amdgpu_fill_buffer(abo, AMDGPU_POISON, NULL,
>>> &wipe_fence,
>>> + false);
>>> if (r) {
>>> goto error;
>>> } else if (wipe_fence) {
>>> @@ -2040,8 +2041,18 @@ void
>>> amdgpu_ttm_set_buffer_funcs_status(struct amdgpu_device *adev, bool
>>> enable)
>>> r);
>>> return;
>>> }
>>> +
>>> + r = drm_sched_entity_init(&adev->mman.delayed,
>>
>> I'm not a big fan of the "delayed" name. But I see the parallel with
>> "immediate" and "delayed" entities in amdgpu_vm. We may want to rename
>> adev->mman.entity to "immediate" as well to make the distinction
>> clearer.
Well NAK. The delayed and immediate names in the VM have a completely
different meaning than here (e.g. memory management restrictions).
I think we should just name them high and low priority.
>>
>>
>>> + DRM_SCHED_PRIORITY_NORMAL, &sched,
>>
>> I wonder if this could even be DRM_SCHED_PRIORITY_MIN to minimize its
>> performance impact on page table updates and graphics user mode
>> submissions. Either way, the patch is
>>
>> Acked-by: Felix Kuehling <Felix.Kuehling at amd.com>
>>
>> Thanks,
>> Felix
>>
> I think you may make it more general: not just two drm_sched_entity at
> adev->mman:
>
> struct amdgpu_mman {
> ....
> struct drm_sched_entity entity[DRM_SCHED_PRIORITY_COUNT];
Please don't. Every entity comes with a small overhead and we really
just need the high and low priority I think.
Regards,
Christian.
>
> ....
>
> }
>
> then drm_sched_entity_init each adev->mman.entity[i] at
> amdgpu_ttm_set_buffer_funcs_status. So you can use adev->mman.entity[i]
> for different usage, like adev->mman.entity[DRM_SCHED_PRIORITY_KERNEL]
> for svm migration, adev->mman.entity[DRM_SCHED_PRIORITY_NORMAL] for
> buffer cleaning during alloc/free,
> adev->mman.entity[DRM_SCHED_PRIORITY_MIN] for page table update, etc.
> That will handle different buffer copies with different priorities,
> leading to a performance improvement.
>
>>
>>> + 1, NULL);
>>> + if (r) {
>>> + DRM_ERROR("Failed setting up TTM BO move
>>> entity (%d)\n",
>>> + r);
>>> + goto error_free_entity;
>>> + }
>>> } else {
>>> drm_sched_entity_destroy(&adev->mman.entity);
>>> + drm_sched_entity_destroy(&adev->mman.delayed);
>>> dma_fence_put(man->move);
>>> man->move = NULL;
>>> }
>>> @@ -2053,6 +2064,11 @@ void
>>> amdgpu_ttm_set_buffer_funcs_status(struct amdgpu_device *adev, bool
>>> enable)
>>> size = adev->gmc.visible_vram_size;
>>> man->size = size;
>>> adev->mman.buffer_funcs_enabled = enable;
>>> +
>>> + return;
>>> +
>>> +error_free_entity:
>>> + drm_sched_entity_destroy(&adev->mman.entity);
>>> }
>>>
>>> static int amdgpu_ttm_prepare_job(struct amdgpu_device *adev,
>>> @@ -2060,14 +2076,16 @@ static int amdgpu_ttm_prepare_job(struct
>>> amdgpu_device *adev,
>>> unsigned int num_dw,
>>> struct dma_resv *resv,
>>> bool vm_needs_flush,
>>> - struct amdgpu_job **job)
>>> + struct amdgpu_job **job,
>>> + bool delayed)
>>> {
>>> enum amdgpu_ib_pool_type pool = direct_submit ?
>>> AMDGPU_IB_POOL_DIRECT :
>>> AMDGPU_IB_POOL_DELAYED;
>>> int r;
>>> -
>>> - r = amdgpu_job_alloc_with_ib(adev, &adev->mman.entity,
>>> + struct drm_sched_entity *entity = delayed ? &adev->mman.delayed :
>>> + &adev->mman.entity;
>>> + r = amdgpu_job_alloc_with_ib(adev, entity,
>>> AMDGPU_FENCE_OWNER_UNDEFINED,
>>> num_dw * 4, pool, job);
>>> if (r)
>>> @@ -2108,7 +2126,7 @@ int amdgpu_copy_buffer(struct amdgpu_ring
>>> *ring, uint64_t src_offset,
>>> num_loops = DIV_ROUND_UP(byte_count, max_bytes);
>>> num_dw = ALIGN(num_loops *
>>> adev->mman.buffer_funcs->copy_num_dw, 8);
>>> r = amdgpu_ttm_prepare_job(adev, direct_submit, num_dw,
>>> - resv, vm_needs_flush, &job);
>>> + resv, vm_needs_flush, &job, false);
>>> if (r)
>>> return r;
>>>
>>> @@ -2144,7 +2162,7 @@ static int amdgpu_ttm_fill_mem(struct
>>> amdgpu_ring *ring, uint32_t src_data,
>>> uint64_t dst_addr, uint32_t byte_count,
>>> struct dma_resv *resv,
>>> struct dma_fence **fence,
>>> - bool vm_needs_flush)
>>> + bool vm_needs_flush, bool delayed)
>>> {
>>> struct amdgpu_device *adev = ring->adev;
>>> unsigned int num_loops, num_dw;
>>> @@ -2157,7 +2175,7 @@ static int amdgpu_ttm_fill_mem(struct
>>> amdgpu_ring *ring, uint32_t src_data,
>>> num_loops = DIV_ROUND_UP_ULL(byte_count, max_bytes);
>>> num_dw = ALIGN(num_loops *
>>> adev->mman.buffer_funcs->fill_num_dw, 8);
>>> r = amdgpu_ttm_prepare_job(adev, false, num_dw, resv,
>>> vm_needs_flush,
>>> - &job);
>>> + &job, delayed);
>>> if (r)
>>> return r;
>>>
>>> @@ -2180,7 +2198,8 @@ static int amdgpu_ttm_fill_mem(struct
>>> amdgpu_ring *ring, uint32_t src_data,
>>> int amdgpu_fill_buffer(struct amdgpu_bo *bo,
>>> uint32_t src_data,
>>> struct dma_resv *resv,
>>> - struct dma_fence **f)
>>> + struct dma_fence **f,
>>> + bool delayed)
>>> {
>>> struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
>>> struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;
>>> @@ -2209,7 +2228,7 @@ int amdgpu_fill_buffer(struct amdgpu_bo *bo,
>>> goto error;
>>>
>>> r = amdgpu_ttm_fill_mem(ring, src_data, to, cur_size,
>>> resv,
>>> - &next, true);
>>> + &next, true, delayed);
>>> if (r)
>>> goto error;
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>>> index 8ef048a0a33e..e82b1edee7a4 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>>> @@ -61,6 +61,8 @@ struct amdgpu_mman {
>>> struct mutex gtt_window_lock;
>>> /* Scheduler entity for buffer moves */
>>> struct drm_sched_entity entity;
>>> + /* Scheduler entity for VRAM clearing */
>>> + struct drm_sched_entity delayed;
>>>
>>> struct amdgpu_vram_mgr vram_mgr;
>>> struct amdgpu_gtt_mgr gtt_mgr;
>>> @@ -152,7 +154,8 @@ int amdgpu_ttm_copy_mem_to_mem(struct
>>> amdgpu_device *adev,
>>> int amdgpu_fill_buffer(struct amdgpu_bo *bo,
>>> uint32_t src_data,
>>> struct dma_resv *resv,
>>> - struct dma_fence **fence);
>>> + struct dma_fence **fence,
>>> + bool delayed);
>>>
>>> int amdgpu_ttm_alloc_gart(struct ttm_buffer_object *bo);
>>> void amdgpu_ttm_recover_gart(struct ttm_buffer_object *tbo);
More information about the amd-gfx
mailing list