[PATCH] drm/amdgpu: restructure amdgpu_vram_mgr_new

Mon Apr 26 16:33:38 UTC 2021

Am 2021-04-26 um 12:12 p.m. schrieb Christian König:
>
>
> Am 26.04.21 um 18:02 schrieb Felix Kuehling:
>> Am 2021-04-26 um 4:54 a.m. schrieb Christian König:
>>> Merge the two loops, loosen the restriction for big allocations.
>>> This reduces the CPU overhead in the good case, but increases
>>> it a bit under memory pressure.
>>>
>>> Signed-off-by: Christian König <christian.koenig at amd.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 58
>>> +++++++++-----------
>>>   1 file changed, 27 insertions(+), 31 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>>> index 529c5c32a205..e2cbe19404c0 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>>> @@ -358,13 +358,13 @@ static int amdgpu_vram_mgr_new(struct
>>> ttm_resource_manager *man,
>>>                      const struct ttm_place *place,
>>>                      struct ttm_resource *mem)
>>>   {
>>> +    unsigned long lpfn, num_nodes, pages_per_node, pages_left, pages;
>>>       struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
>>>       struct amdgpu_device *adev = to_amdgpu_device(mgr);
>>> +    uint64_t vis_usage = 0, mem_bytes, max_bytes;
>>>       struct drm_mm *mm = &mgr->mm;
>>> -    struct drm_mm_node *nodes;
>>>       enum drm_mm_insert_mode mode;
>>> -    unsigned long lpfn, num_nodes, pages_per_node, pages_left;
>>> -    uint64_t vis_usage = 0, mem_bytes, max_bytes;
>>> +    struct drm_mm_node *nodes;
>>>       unsigned i;
>>>       int r;
>>>   @@ -391,9 +391,10 @@ static int amdgpu_vram_mgr_new(struct
>>> ttm_resource_manager *man,
>>>           pages_per_node = HPAGE_PMD_NR;
>>>   #else
>>>           /* default to 2MB */
>>> -        pages_per_node = (2UL << (20UL - PAGE_SHIFT));
>>> +        pages_per_node = 2UL << (20UL - PAGE_SHIFT);
>>>   #endif
>>> -        pages_per_node = max((uint32_t)pages_per_node,
>>> mem->page_alignment);
>>> +        pages_per_node = max_t(uint32_t, pages_per_node,
>>> +                       mem->page_alignment);
>>>           num_nodes = DIV_ROUND_UP(mem->num_pages, pages_per_node);
>>>       }
>>>   @@ -411,42 +412,37 @@ static int amdgpu_vram_mgr_new(struct
>>> ttm_resource_manager *man,
>>>       mem->start = 0;
>>>       pages_left = mem->num_pages;
>>>   -    spin_lock(&mgr->lock);
>>> -    for (i = 0; pages_left >= pages_per_node; ++i) {
>>> -        unsigned long pages = rounddown_pow_of_two(pages_left);
>>> -
>>> -        /* Limit maximum size to 2GB due to SG table limitations */
>>> -        pages = min(pages, (2UL << (30 - PAGE_SHIFT)));
>>> +    /* Limit maximum size to 2GB due to SG table limitations */
>>> +    pages = min(pages_left, 2UL << (30 - PAGE_SHIFT));
>>>   -        r = drm_mm_insert_node_in_range(mm, &nodes[i], pages,
>>> -                        pages_per_node, 0,
>>> -                        place->fpfn, lpfn,
>>> -                        mode);
>>> -        if (unlikely(r))
>>> -            break;
>>> -
>>> -        vis_usage += amdgpu_vram_mgr_vis_size(adev, &nodes[i]);
>>> -        amdgpu_vram_mgr_virt_start(mem, &nodes[i]);
>>> -        pages_left -= pages;
>>> -    }
>>> -
>>> -    for (; pages_left; ++i) {
>>> -        unsigned long pages = min(pages_left, pages_per_node);
>>> +    i = 0;
>>> +    spin_lock(&mgr->lock);
>>> +    while (pages_left) {
>>>           uint32_t alignment = mem->page_alignment;
>>>   -        if (pages == pages_per_node)
>>> +        if (pages >= pages_per_node)
>>>               alignment = pages_per_node;
>>>   -        r = drm_mm_insert_node_in_range(mm, &nodes[i],
>>> -                        pages, alignment, 0,
>>> -                        place->fpfn, lpfn,
>>> -                        mode);
>>> -        if (unlikely(r))
>>> +        r = drm_mm_insert_node_in_range(mm, &nodes[i], pages,
>>> alignment,
>>> +                        0, place->fpfn, lpfn, mode);
>>> +        if (unlikely(r)) {
>>> +            if (pages > pages_per_node) {
>> This means we can never allocate chunks smaller than 2MB, except for the
>> tail. And the tail still needs to be allocated in one piece if it's <
>> 2MB.
>
> Correct, but that was the behavior before as well.
>
>> On the other hand, we should not allow allocations smaller than
>> mem->page_alignment, except for the tail. So should this condition be
>> "if (pages > mem->page_alignment)" to allow maximum flexibility for
>> allocations without physical alignment constraints when memory is very
>> fragmented?
>
> See a few lines above:
>
> pages_per_node = max_t(uint32_t, pages_per_node, mem->page_alignment);
>
> So pages_per_node is always at least as large as page_alignment, and we
> actually can't allocate less than pages_per_node in one allocation or we
> would overflow the nodes array.

Makes sense. The patch is

Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com>

>
> Regards,
> Christian.
>
>>
>> Regards,
>>    Felix
>>
>>
>>> +                if (is_power_of_2(pages))
>>> +                    pages = pages / 2;
>>> +                else
>>> +                    pages = rounddown_pow_of_two(pages);
>>> +                continue;
>>> +            }
>>>               goto error;
>>> +        }
>>>             vis_usage += amdgpu_vram_mgr_vis_size(adev, &nodes[i]);
>>>           amdgpu_vram_mgr_virt_start(mem, &nodes[i]);
>>>           pages_left -= pages;
>>> +        ++i;
>>> +
>>> +        if (pages > pages_left)
>>> +            pages = pages_left;
>>>       }
>>>       spin_unlock(&mgr->lock);
>>>   
>