[RFC PATCH 07/16] drm/ttm: Reduce the number of used allocation orders for TTM pages

Thu Feb 16 07:11:46 UTC 2023

Am 15.02.23 um 20:00 schrieb Thomas Hellström:
> On Wed, 2023-02-15 at 19:30 +0100, Christian König wrote:
>> Am 15.02.23 um 19:12 schrieb Thomas Hellström:
>>> On Wed, 2023-02-15 at 18:42 +0100, Christian König wrote:
>>>> Am 15.02.23 um 17:13 schrieb Thomas Hellström:
>>>>> When swapping out, we will split multi-order pages both in
>>>>> order to
>>>>> move them to the swap-cache and to be able to return memory to
>>>>> the
>>>>> swap cache as soon as possible on a page-by-page basis.
>>>>> By reducing the page max order to the system PMD size, we can
>>>>> be
>>>>> nicer
>>>>> to the system and avoid splitting gigantic pages.
>>>>> On top of this we also
>>>>> include the 64K page size in the page sizes tried, since that
>>>>> appears to
>>>>> be a common size for GPU applications.
>>>> Please completely drop that.
>>> You mean the 64K page size, or the whole patch?
>> The 64K page size. This was an invention from Microsoft to
>> standardize
>> GPU handling ~15-20years ago.
>>
>> It turned out to be a complete shipwreck and by now 2MiB and 1GiB
>> pages
>> or just flexible hardware which can handle everything seem to become
>> standard.
>>
>>>> This is just nonsense spilling in from the
>>>> Windows drivers.
>>> Agreed, but IIRC on the last RFC you asked me not to drop the 64K
>>> pages, so that's why they are here. I can remove them if needed.
>> We could keep it if it's in any way beneficial, but I'm pretty sure I
>> must have been drunk to ask for that.
>>
>>> The only reason for keeping them from a performance point of view
>>> is
>>> better efficiency on GPUs with 64K page size if not using a
>>> coalescing
>>> IOMMU for dma-mapping.
>> Are any of those still produced? As far as I know neither NVidia,
>> Intel
>> nor AMD still assumes that page size in their hardware for quite a
>> while
>> now.
> Intel still supports 64K PTEs, so we use them where possible, otherwise
> falling back to 4K. Typically we have coalescing IOMMU enabled when
> testing, so can't really see the impact, but TBH I was surprised by the
> number of 64K page allocations TTM spat out with this patch series, so
> I definitely think there is a performance impact with !IOMMU, although
> I can't quantify it ATM.
>
> So then if it's OK with you I'll keep that size for now.

If it makes 64K pages preferred then this is a pretty clear NAK.

What we can do is to support any page size up to at least 2MiB here.

Christian.

>
> /Thomas
>
>
>
>> Regards,
>> Christian.
>>
>>> Let me know what you think is best and I'll adjust accordingly.
>>>
>>> /Thomas
>>>
>>>
>>>> Christian.
>>>>
>>>>> Looking forward to when we might be able to swap out PMD size
>>>>> folios
>>>>> without splitting, this will also be a benefit.
>>>>>
>>>>> Signed-off-by: Thomas Hellström
>>>>> <thomas.hellstrom at linux.intel.com>
>>>>> ---
>>>>>     drivers/gpu/drm/ttm/ttm_pool.c | 58
>>>>> ++++++++++++++++++++++++++---
>>>>> -----
>>>>>     1 file changed, 45 insertions(+), 13 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/ttm/ttm_pool.c
>>>>> b/drivers/gpu/drm/ttm/ttm_pool.c
>>>>> index 1cc7591a9542..8787fb6a218b 100644
>>>>> --- a/drivers/gpu/drm/ttm/ttm_pool.c
>>>>> +++ b/drivers/gpu/drm/ttm/ttm_pool.c
>>>>> @@ -31,6 +31,8 @@
>>>>>      * cause they are rather slow compared to alloc_pages+map.
>>>>>      */
>>>>>     
>>>>> +#define pr_fmt(fmt) "[TTM POOL] " fmt
>>>>> +
>>>>>     #include <linux/module.h>
>>>>>     #include <linux/dma-mapping.h>
>>>>>     #include <linux/debugfs.h>
>>>>> @@ -47,6 +49,18 @@
>>>>>     
>>>>>     #include "ttm_module.h"
>>>>>     
>>>>> +#define TTM_MAX_ORDER (PMD_SHIFT - PAGE_SHIFT)
>>>>> +#define TTM_64K_ORDER (16 - PAGE_SHIFT)
>>>>> +#if (TTM_MAX_ORDER < TTM_64K_ORDER)
>>>>> +#undef TTM_MAX_ORDER
>>>>> +#define TTM_MAX_ORDER TTM_64K_ORDER
>>>>> +#endif
>>>>> +#if ((MAX_ORDER - 1) < TTM_MAX_ORDER)
>>>>> +#undef TTM_MAX_ORDER
>>>>> +#define TTM_MAX_ORDER (MAX_ORDER - 1)
>>>>> +#endif
>>>>> +#define TTM_DIM_ORDER (TTM_MAX_ORDER + 1)
>>>>> +
>>>>>     /**
>>>>>      * struct ttm_pool_dma - Helper object for coherent DMA
>>>>> mappings
>>>>>      *
>>>>> @@ -65,16 +79,18 @@ module_param(page_pool_size, ulong, 0644);
>>>>>     
>>>>>     static atomic_long_t allocated_pages;
>>>>>     
>>>>> -static struct ttm_pool_type global_write_combined[MAX_ORDER];
>>>>> -static struct ttm_pool_type global_uncached[MAX_ORDER];
>>>>> +static struct ttm_pool_type
>>>>> global_write_combined[TTM_DIM_ORDER];
>>>>> +static struct ttm_pool_type global_uncached[TTM_DIM_ORDER];
>>>>>     
>>>>> -static struct ttm_pool_type
>>>>> global_dma32_write_combined[MAX_ORDER];
>>>>> -static struct ttm_pool_type global_dma32_uncached[MAX_ORDER];
>>>>> +static struct ttm_pool_type
>>>>> global_dma32_write_combined[TTM_DIM_ORDER];
>>>>> +static struct ttm_pool_type
>>>>> global_dma32_uncached[TTM_DIM_ORDER];
>>>>>     
>>>>>     static spinlock_t shrinker_lock;
>>>>>     static struct list_head shrinker_list;
>>>>>     static struct shrinker mm_shrinker;
>>>>>     
>>>>> +static unsigned int ttm_pool_orders[] = {TTM_MAX_ORDER, 0, 0};
>>>>> +
>>>>>     /* Allocate pages of size 1 << order with the given
>>>>> gfp_flags */
>>>>>     static struct page *ttm_pool_alloc_page(struct ttm_pool
>>>>> *pool,
>>>>> gfp_t gfp_flags,
>>>>>                                           unsigned int order)
>>>>> @@ -400,6 +416,17 @@ static void __ttm_pool_free(struct
>>>>> ttm_pool
>>>>> *pool, struct ttm_tt *tt,
>>>>>           }
>>>>>     }
>>>>>     
>>>>> +static unsigned int ttm_pool_select_order(unsigned int order,
>>>>> pgoff_t num_pages)
>>>>> +{
>>>>> +       unsigned int *cur_order = ttm_pool_orders;
>>>>> +
>>>>> +       order = min_t(unsigned int, __fls(num_pages), order);
>>>>> +       while (order < *cur_order)
>>>>> +               ++cur_order;
>>>>> +
>>>>> +       return *cur_order;
>>>>> +}
>>>>> +
>>>>>     /**
>>>>>      * ttm_pool_alloc - Fill a ttm_tt object
>>>>>      *
>>>>> @@ -439,9 +466,8 @@ int ttm_pool_alloc(struct ttm_pool *pool,
>>>>> struct ttm_tt *tt,
>>>>>           else
>>>>>                   gfp_flags |= GFP_HIGHUSER;
>>>>>     
>>>>> -       for (order = min_t(unsigned int, MAX_ORDER - 1,
>>>>> __fls(num_pages));
>>>>> -            num_pages;
>>>>> -            order = min_t(unsigned int, order,
>>>>> __fls(num_pages)))
>>>>> {
>>>>> +       order = ttm_pool_select_order(ttm_pool_orders[0],
>>>>> num_pages);
>>>>> +       for (; num_pages; order = ttm_pool_select_order(order,
>>>>> num_pages)) {
>>>>>                   struct ttm_pool_type *pt;
>>>>>     
>>>>>                   page_caching = tt->caching;
>>>>> @@ -558,7 +584,7 @@ void ttm_pool_init(struct ttm_pool *pool,
>>>>> struct device *dev,
>>>>>     
>>>>>           if (use_dma_alloc) {
>>>>>                   for (i = 0; i < TTM_NUM_CACHING_TYPES; ++i)
>>>>> -                       for (j = 0; j < MAX_ORDER; ++j)
>>>>> +                       for (j = 0; j < TTM_DIM_ORDER; ++j)
>>>>>                                   ttm_pool_type_init(&pool-
>>>>>> caching[i].orders[j],
>>>>>                                                      pool, i,
>>>>> j);
>>>>>           }
>>>>> @@ -578,7 +604,7 @@ void ttm_pool_fini(struct ttm_pool *pool)
>>>>>     
>>>>>           if (pool->use_dma_alloc) {
>>>>>                   for (i = 0; i < TTM_NUM_CACHING_TYPES; ++i)
>>>>> -                       for (j = 0; j < MAX_ORDER; ++j)
>>>>> +                       for (j = 0; j < TTM_DIM_ORDER; ++j)
>>>>>                                   ttm_pool_type_fini(&pool-
>>>>>> caching[i].orders[j]);
>>>>>           }
>>>>>     
>>>>> @@ -632,7 +658,7 @@ static void ttm_pool_debugfs_header(struct
>>>>> seq_file *m)
>>>>>           unsigned int i;
>>>>>     
>>>>>           seq_puts(m, "\t ");
>>>>> -       for (i = 0; i < MAX_ORDER; ++i)
>>>>> +       for (i = 0; i < TTM_DIM_ORDER; ++i)
>>>>>                   seq_printf(m, " ---%2u---", i);
>>>>>           seq_puts(m, "\n");
>>>>>     }
>>>>> @@ -643,7 +669,7 @@ static void ttm_pool_debugfs_orders(struct
>>>>> ttm_pool_type *pt,
>>>>>     {
>>>>>           unsigned int i;
>>>>>     
>>>>> -       for (i = 0; i < MAX_ORDER; ++i)
>>>>> +       for (i = 0; i < TTM_DIM_ORDER; ++i)
>>>>>                   seq_printf(m, " %8u",
>>>>> ttm_pool_type_count(&pt[i]));
>>>>>           seq_puts(m, "\n");
>>>>>     }
>>>>> @@ -749,10 +775,16 @@ int ttm_pool_mgr_init(unsigned long
>>>>> num_pages)
>>>>>           if (!page_pool_size)
>>>>>                   page_pool_size = num_pages;
>>>>>     
>>>>> +       if (TTM_64K_ORDER < TTM_MAX_ORDER)
>>>>> +               ttm_pool_orders[1] = TTM_64K_ORDER;
>>>>> +
>>>>> +       pr_debug("Used orders are %u %u %u\n",
>>>>> ttm_pool_orders[0],
>>>>> +                ttm_pool_orders[1], ttm_pool_orders[2]);
>>>>> +
>>>>>           spin_lock_init(&shrinker_lock);
>>>>>           INIT_LIST_HEAD(&shrinker_list);
>>>>>     
>>>>> -       for (i = 0; i < MAX_ORDER; ++i) {
>>>>> +       for (i = 0; i < TTM_DIM_ORDER; ++i) {
>>>>>                   ttm_pool_type_init(&global_write_combined[i],
>>>>> NULL,
>>>>>                                      ttm_write_combined, i);
>>>>>                   ttm_pool_type_init(&global_uncached[i], NULL,
>>>>> ttm_uncached, i);
>>>>> @@ -785,7 +817,7 @@ void ttm_pool_mgr_fini(void)
>>>>>     {
>>>>>           unsigned int i;
>>>>>     
>>>>> -       for (i = 0; i < MAX_ORDER; ++i) {
>>>>> +       for (i = 0; i < TTM_DIM_ORDER; ++i) {
>>>>>                   ttm_pool_type_fini(&global_write_combined[i]);
>>>>>                   ttm_pool_type_fini(&global_uncached[i]);
>>>>>