[PATCH] drm/amdgpu: further lower VRAM allocation overhead

Wed Jul 14 12:08:52 UTC 2021

Am 13.07.21 um 18:11 schrieb Felix Kuehling:
> Am 2021-07-13 um 9:32 a.m. schrieb Christian König:
>> For allocations larger than 48MiB we need more than a page for the
>> housekeeping in the worst case, resulting in the usual vmalloc overhead.
>>
>> Try to avoid this by assuming the good case and only falling back to the
>> worst case if this didn't work.
>>
>> Signed-off-by: Christian König <christian.koenig at amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 80 +++++++++++++++-----
>>   1 file changed, 60 insertions(+), 20 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>> index 2fd77c36a1ff..ab8c5e28df7b 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>> @@ -361,19 +361,23 @@ static void amdgpu_vram_mgr_virt_start(struct ttm_resource *mem,
>>    * @man: TTM memory type manager
>>    * @tbo: TTM BO we need this range for
>>    * @place: placement flags and restrictions
>> - * @mem: the resulting mem object
>> + * @num_nodes: number of page nodes to use.
>> + * @pages_per_node: number of pages per node to use.
>> + * @res: the resulting mem object
>>    *
>>    * Allocate VRAM for the given BO.
>>    */
>>   static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
>>   			       struct ttm_buffer_object *tbo,
>>   			       const struct ttm_place *place,
>> +			       unsigned long num_nodes,
>> +			       unsigned long pages_per_node,
>>   			       struct ttm_resource **res)
>>   {
>> -	unsigned long lpfn, num_nodes, pages_per_node, pages_left, pages;
>>   	struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
>>   	struct amdgpu_device *adev = to_amdgpu_device(mgr);
>>   	uint64_t vis_usage = 0, mem_bytes, max_bytes;
>> +	unsigned long lpfn, pages_left, pages;
>>   	struct ttm_range_mgr_node *node;
>>   	struct drm_mm *mm = &mgr->mm;
>>   	enum drm_mm_insert_mode mode;
>> @@ -395,21 +399,6 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
>>   		goto error_sub;
>>   	}
>>   
>> -	if (place->flags & TTM_PL_FLAG_CONTIGUOUS) {
>> -		pages_per_node = ~0ul;
>> -		num_nodes = 1;
>> -	} else {
>> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>> -		pages_per_node = HPAGE_PMD_NR;
>> -#else
>> -		/* default to 2MB */
>> -		pages_per_node = 2UL << (20UL - PAGE_SHIFT);
>> -#endif
>> -		pages_per_node = max_t(uint32_t, pages_per_node,
>> -				       tbo->page_alignment);
>> -		num_nodes = DIV_ROUND_UP_ULL(PFN_UP(mem_bytes), pages_per_node);
>> -	}
>> -
>>   	node = kvmalloc(struct_size(node, mm_nodes, num_nodes),
>>   			GFP_KERNEL | __GFP_ZERO);
>>   	if (!node) {
>> @@ -431,10 +420,15 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
>>   	i = 0;
>>   	spin_lock(&mgr->lock);
>>   	while (pages_left) {
>> -		uint32_t alignment = tbo->page_alignment;
>> +		unsigned long alignment = tbo->page_alignment;
>> +
>> +		if (i >= num_nodes) {
>> +			r = -E2BIG;
>> +			goto error_free;
>> +		}
>>   
>>   		if (pages >= pages_per_node)
>> -			alignment = pages_per_node;
>> +			alignment = max(alignment, pages_per_node);
> I don't understand this change. Is this an unrelated fix? pages_per_node
> is already bumped up to tbo->page_alignment in amdgpu_vram_mgr_alloc. So
> this "max" operation here seems redundant.

Oh, yes, good point. I totally missed that and was wondering why this
isn't a problem here.

>
> Other than that, the patch is
>
> Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com>

Thanks,
Christian.

>
> @JinHuiEric, can you confirm the performance improvement?
>
> Thanks,
>    Felix
>
>
>>   
>>   		r = drm_mm_insert_node_in_range(mm, &node->mm_nodes[i], pages,
>>   						alignment, 0, place->fpfn,
>> @@ -483,6 +477,52 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
>>   	return r;
>>   }
>>   
>> +/**
>> + * amdgpu_vram_mgr_alloc - allocate new range
>> + *
>> + * @man: TTM memory type manager
>> + * @tbo: TTM BO we need this range for
>> + * @place: placement flags and restrictions
>> + * @res: the resulting mem object
>> + *
>> + * Allocate VRAM for the given BO.
>> + */
>> +static int amdgpu_vram_mgr_alloc(struct ttm_resource_manager *man,
>> +				 struct ttm_buffer_object *tbo,
>> +				 const struct ttm_place *place,
>> +				 struct ttm_resource **res)
>> +{
>> +	unsigned long num_nodes, pages_per_node;
>> +	struct ttm_range_mgr_node *node;
>> +	int r;
>> +
>> +	if (place->flags & TTM_PL_FLAG_CONTIGUOUS)
>> +		return amdgpu_vram_mgr_new(man, tbo, place, 1, ~0ul, res);
>> +
>> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>> +	pages_per_node = HPAGE_PMD_NR;
>> +#else
>> +	/* default to 2MB */
>> +	pages_per_node = 2UL << (20UL - PAGE_SHIFT);
>> +#endif
>> +	pages_per_node = max_t(uint32_t, pages_per_node, tbo->page_alignment);
>> +	num_nodes = DIV_ROUND_UP_ULL(PFN_UP(tbo->base.size), pages_per_node);
>> +
>> +	if (struct_size(node, mm_nodes, num_nodes) > PAGE_SIZE) {
>> +		size_t size = PAGE_SIZE;
>> +
>> +		size -= sizeof(struct ttm_range_mgr_node);
>> +		size /= sizeof(struct drm_mm_node);
>> +		r = amdgpu_vram_mgr_new(man, tbo, place, size, pages_per_node,
>> +					res);
>> +		if (r != -E2BIG)
>> +			return r;
>> +	}
>> +
>> +	return amdgpu_vram_mgr_new(man, tbo, place, num_nodes, pages_per_node,
>> +				   res);
>> +}
>> +
>>   /**
>>    * amdgpu_vram_mgr_del - free ranges
>>    *
>> @@ -680,7 +720,7 @@ static void amdgpu_vram_mgr_debug(struct ttm_resource_manager *man,
>>   }
>>   
>>   static const struct ttm_resource_manager_func amdgpu_vram_mgr_func = {
>> -	.alloc	= amdgpu_vram_mgr_new,
>> +	.alloc	= amdgpu_vram_mgr_alloc,
>>   	.free	= amdgpu_vram_mgr_del,
>>   	.debug	= amdgpu_vram_mgr_debug
>>   };