[PATCH] drm/amdgpu: Modify the contiguous flags behaviour
Paneer Selvam, Arunpravin
arunpravin.paneerselvam at amd.com
Tue Apr 16 07:56:12 UTC 2024
Hi Christian,
On 4/16/2024 1:16 PM, Christian König wrote:
> Am 16.04.24 um 00:02 schrieb Philip Yang:
>> On 2024-04-14 10:57, Arunpravin Paneer Selvam wrote:
>>> Now we have two flags for contiguous VRAM buffer allocation.
>>> If the application requests AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS,
>>> the driver would set the TTM_PL_FLAG_CONTIGUOUS place flag in the
>>> buffer's placement function.
>>>
>>> This patch will change the default behaviour of the two flags.
>> This change will simplify the KFD best-effort contiguous VRAM
>> allocation, because KFD doesn't need to set a new GEM_ flag.
>>> When we set AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS
>>> - This means contiguous allocation is not mandatory.
>>
>> AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS is used in a couple of places. For
>> page table BOs, it is fine as the BO size is the 4K page size. For
>> 64KB reserved BOs and F/W size related BOs, do all allocations happen
>> at driver initialization, before the VRAM is fragmented?
>>
>
> Oh, that's a really good point, we need to keep the behavior as is for
> kernel allocations. Arun, can you take care of that?
Sure, I will take care of the kernel allocations.
Thanks,
Arun.
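Something along these lines is what I have in mind, i.e. restore the
mandatory TTM flag only for kernel BOs in amdgpu_bo_placement_from_domain().
This is just an untested sketch of the idea, not the final change:

    /* Sketch: keep the old mandatory behaviour for kernel allocations
     * only, so page tables, reserved and F/W BOs still fail hard
     * instead of silently falling back to scattered pages. Whether
     * tbo.type is already valid at this point still needs checking.
     */
    if ((flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) &&
        abo->tbo.type == ttm_bo_type_kernel)
            places[c].flags |= TTM_PL_FLAG_CONTIGUOUS;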
>
> Thanks,
> Christian.
>
>>> - We will try to allocate the buffer contiguously. If the
>>> allocation fails, we fall back to allocating the individual pages.
>>>
>>> When we set TTM_PL_FLAG_CONTIGUOUS
>>> - This means contiguous allocation is mandatory.
>>> - We set this in amdgpu_bo_pin_restricted() before BO validation
>>> and check this flag in the VRAM manager.
>>> - If this is set, we must allocate the buffer pages contiguously.
>>> If the allocation fails, we return -ENOSPC.
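For context, this is roughly how an application would ask for the new
best-effort behaviour through the libdrm amdgpu wrapper (dev is assumed
to be an initialised amdgpu_device_handle; size and alignment below are
just example values):

    struct amdgpu_bo_alloc_request req = {
            .alloc_size = 64ULL << 20,          /* example: 64 MiB */
            .phys_alignment = 4096,
            .preferred_heap = AMDGPU_GEM_DOMAIN_VRAM,
            /* best effort: the kernel may fall back to scattered pages */
            .flags = AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS,
    };
    amdgpu_bo_handle buf;
    int r = amdgpu_bo_alloc(dev, &req, &buf);

Pinning the BO, e.g. for RDMA/dmabuf export, is what turns contiguous
placement into a hard requirement via TTM_PL_FLAG_CONTIGUOUS.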
>>>
>>> Signed-off-by: Arunpravin Paneer Selvam <Arunpravin.PaneerSelvam at amd.com>
>>> Suggested-by: Christian König <christian.koenig at amd.com>
>>> ---
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 14 +++--
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 57 +++++++++++++++-----
>>> 2 files changed, 49 insertions(+), 22 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>> index 8bc79924d171..41926d631563 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>> @@ -153,8 +153,6 @@ void amdgpu_bo_placement_from_domain(struct amdgpu_bo *abo, u32 domain)
>>> else
>>> places[c].flags |= TTM_PL_FLAG_TOPDOWN;
>>> - if (flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
>>> - places[c].flags |= TTM_PL_FLAG_CONTIGUOUS;
>>> c++;
>>> }
>>> @@ -899,6 +897,8 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain,
>>> {
>>> struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
>>> struct ttm_operation_ctx ctx = { false, false };
>>> + struct ttm_place *places = bo->placements;
>>> + u32 c = 0;
>>> int r, i;
>>> if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm))
>>> @@ -921,16 +921,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain,
>>> if (bo->tbo.pin_count) {
>>> uint32_t mem_type = bo->tbo.resource->mem_type;
>>> - uint32_t mem_flags = bo->tbo.resource->placement;
>>> if (!(domain & amdgpu_mem_type_to_domain(mem_type)))
>>> return -EINVAL;
>>> - if ((mem_type == TTM_PL_VRAM) &&
>>> - (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) &&
>>> - !(mem_flags & TTM_PL_FLAG_CONTIGUOUS))
>>> - return -EINVAL;
>>> -
>> This looks like a bug before, but with this patch, the check makes
>> sense and is needed.
>>> ttm_bo_pin(&bo->tbo);
>>> if (max_offset != 0) {
>>> @@ -968,6 +962,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain,
>>> bo->placements[i].lpfn = lpfn;
>>> }
>>> + if (domain & AMDGPU_GEM_DOMAIN_VRAM &&
>>> + !WARN_ON(places[c].mem_type != TTM_PL_VRAM))
>>> + places[c].flags |= TTM_PL_FLAG_CONTIGUOUS;
>>> +
>>
>> If the pinned BO is not allocated with AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS,
>> it should pin and return scattered pages, because RDMA supports
>> scattered dmabuf. Christian also pointed this out.
>>
>> if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS &&
>>     bo->placements[i].mem_type == TTM_PL_VRAM)
>>         bo->placements[i].flags |= TTM_PL_FLAG_CONTIGUOUS;
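Expanding that suggestion into the placement loop in
amdgpu_bo_pin_restricted() might look roughly like this (a sketch of the
idea, not tested):

    /* Only force contiguous pinning when the BO was created with
     * AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS; other VRAM BOs keep scattered
     * pages, which RDMA can still consume through a scattered dmabuf.
     */
    for (i = 0; i < bo->placement.num_placement; i++) {
            if ((bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) &&
                bo->placements[i].mem_type == TTM_PL_VRAM)
                    bo->placements[i].flags |= TTM_PL_FLAG_CONTIGUOUS;
    }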
>>> r = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
>>> if (unlikely(r)) {
>>> dev_err(adev->dev, "%p pin failed\n", bo);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>>> index 8db880244324..ddbf302878f6 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>>> @@ -88,6 +88,30 @@ static inline u64 amdgpu_vram_mgr_blocks_size(struct list_head *head)
>>> return size;
>>> }
>>> +static inline unsigned long
>>> +amdgpu_vram_find_pages_per_block(struct ttm_buffer_object *tbo,
>>> + const struct ttm_place *place,
>>> + unsigned long bo_flags)
>>> +{
>>> + unsigned long pages_per_block;
>>> +
>>> + if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS ||
>>> + place->flags & TTM_PL_FLAG_CONTIGUOUS) {
>>> + pages_per_block = ~0ul;
>>> + } else {
>>> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>> + pages_per_block = HPAGE_PMD_NR;
>>> +#else
>>> + /* default to 2MB */
>>> + pages_per_block = 2UL << (20UL - PAGE_SHIFT);
>>> +#endif
>>> + pages_per_block = max_t(uint32_t, pages_per_block,
>>> + tbo->page_alignment);
>>> + }
>>> +
>>> + return pages_per_block;
>>> +}
>>> +
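As a quick sanity check of the non-contiguous default, assuming 4 KiB
pages (PAGE_SHIFT == 12), the fallback expression works out to the same
2 MiB block that HPAGE_PMD_NR gives on x86-64:

    /* Standalone worked example of the 2 MiB default block size. */
    #include <stdio.h>

    #define PAGE_SHIFT 12UL   /* assumption: 4 KiB pages */

    int main(void)
    {
            unsigned long pages_per_block = 2UL << (20UL - PAGE_SHIFT);

            /* 2 << 8 = 512 pages, 512 * 4 KiB = 2 MiB */
            printf("%lu pages (%lu KiB)\n", pages_per_block,
                   pages_per_block << (PAGE_SHIFT - 10));
            return 0;
    }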
>>> /**
>>> * DOC: mem_info_vram_total
>>> *
>>> @@ -451,8 +475,10 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
>>> struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
>>> struct amdgpu_device *adev = to_amdgpu_device(mgr);
>>> u64 vis_usage = 0, max_bytes, min_block_size;
>>> + struct amdgpu_bo *bo = ttm_to_amdgpu_bo(tbo);
>>> struct amdgpu_vram_mgr_resource *vres;
>>> u64 size, remaining_size, lpfn, fpfn;
>>> + unsigned long bo_flags = bo->flags;
>>> struct drm_buddy *mm = &mgr->mm;
>>> struct drm_buddy_block *block;
>>> unsigned long pages_per_block;
>>> @@ -468,18 +494,8 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
>>> if (tbo->type != ttm_bo_type_kernel)
>>> max_bytes -= AMDGPU_VM_RESERVED_VRAM;
>>> - if (place->flags & TTM_PL_FLAG_CONTIGUOUS) {
>>> - pages_per_block = ~0ul;
>>> - } else {
>>> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>> - pages_per_block = HPAGE_PMD_NR;
>>> -#else
>>> - /* default to 2MB */
>>> - pages_per_block = 2UL << (20UL - PAGE_SHIFT);
>>> -#endif
>>> - pages_per_block = max_t(uint32_t, pages_per_block,
>>> - tbo->page_alignment);
>>> - }
>>> + pages_per_block =
>>> + amdgpu_vram_find_pages_per_block(tbo, place, bo_flags);
>>> vres = kzalloc(sizeof(*vres), GFP_KERNEL);
>>> if (!vres)
>>> @@ -498,7 +514,8 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
>>> if (place->flags & TTM_PL_FLAG_TOPDOWN)
>>> vres->flags |= DRM_BUDDY_TOPDOWN_ALLOCATION;
>>> - if (place->flags & TTM_PL_FLAG_CONTIGUOUS)
>>> + if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS ||
>>> + place->flags & TTM_PL_FLAG_CONTIGUOUS)
>>> vres->flags |= DRM_BUDDY_CONTIGUOUS_ALLOCATION;
>>> if (fpfn || lpfn != mgr->mm.size)
>>> @@ -529,8 +546,20 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
>>> min_block_size,
>>> &vres->blocks,
>>> vres->flags);
>>> - if (unlikely(r))
>>> + if (unlikely(r)) {
>>
>> If pinning a BO fails to allocate contiguous VRAM, this should return the
>> failure to the application, as RDMA cannot work with scattered VRAM pages.
>>
>> Regards,
>>
>> Philip
>>
>>> + if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) {
>>> + /* Fallback to non contiguous allocation */
>>> + vres->flags &= ~DRM_BUDDY_CONTIGUOUS_ALLOCATION;
>>> + bo_flags &= ~AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
>>> +
>>> + pages_per_block =
>>> + amdgpu_vram_find_pages_per_block(tbo,
>>> + place,
>>> + bo_flags);
>>> + continue;
>>> + }
>>> goto error_free_blocks;
>>> + }
>>> if (size > remaining_size)
>>> remaining_size = 0;
>
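One possible way to address Philip's point would be to skip the fallback
when the placement itself demands contiguous memory, i.e. only fall back
for best-effort requests. A rough sketch on top of the hunk above, not
tested:

    if (unlikely(r)) {
            /* Only fall back for best-effort requests; when the pin
             * path has set TTM_PL_FLAG_CONTIGUOUS (e.g. for RDMA),
             * propagate the failure so the caller sees the error.
             */
            if ((bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) &&
                !(place->flags & TTM_PL_FLAG_CONTIGUOUS)) {
                    vres->flags &= ~DRM_BUDDY_CONTIGUOUS_ALLOCATION;
                    bo_flags &= ~AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
                    pages_per_block =
                            amdgpu_vram_find_pages_per_block(tbo, place,
                                                             bo_flags);
                    continue;
            }
            goto error_free_blocks;
    }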