[RFC PATCH 07/16] drm/ttm: Reduce the number of used allocation orders for TTM pages

Wed Feb 15 18:30:35 UTC 2023

Am 15.02.23 um 19:12 schrieb Thomas Hellström:
> On Wed, 2023-02-15 at 18:42 +0100, Christian König wrote:
>> Am 15.02.23 um 17:13 schrieb Thomas Hellström:
>>> When swapping out, we will split multi-order pages both in order to
>>> move them to the swap-cache and to be able to return memory to the
>>> swap cache as soon as possible on a page-by-page basis.
>>> By reducing the page max order to the system PMD size, we can be
>>> nicer
>>> to the system and avoid splitting gigantic pages.
>>
>>> On top of this we also
>>> include the 64K page size in the page sizes tried, since that
>>> appears to
>>> be a common size for GPU applications.
>> Please completely drop that.
> You mean the 64K page size, or the whole patch?

The 64K page size. This was an invention from Microsoft to standardize 
GPU handling ~15-20years ago.

It turned out to be a complete shipwreck and by now 2MiB and 1GiB pages 
or just flexible hardware which can handle everything seem to become 
standard.

>> This is just nonsense spilling in from the
>> Windows drivers.
> Agreed, but IIRC on the last RFC you asked me not to drop the 64K
> pages, so that's why they are here. I can remove them if needed.

We could keep it if it's in any way beneficial, but I'm pretty sure I 
must have been drunk to ask for that.

> The only reason for keeping them from a performance point of view is
> better efficiency on GPUs with 64K page size if not using a coalescing
> IOMMU for dma-mapping.

Are any of those still produced? As far as I know neither NVidia, Intel 
nor AMD still assumes that page size in their hardware for quite a while 
now.

Regards,
Christian.

>
> Let me know what you think is best and I'll adjust accordingly.
>
> /Thomas
>
>
>> Christian.
>>
>>> Looking forward to when we might be able to swap out PMD size
>>> folios
>>> without splitting, this will also be a benefit.
>>>
>>> Signed-off-by: Thomas Hellström <thomas.hellstrom at linux.intel.com>
>>> ---
>>>    drivers/gpu/drm/ttm/ttm_pool.c | 58 ++++++++++++++++++++++++++---
>>> -----
>>>    1 file changed, 45 insertions(+), 13 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/ttm/ttm_pool.c
>>> b/drivers/gpu/drm/ttm/ttm_pool.c
>>> index 1cc7591a9542..8787fb6a218b 100644
>>> --- a/drivers/gpu/drm/ttm/ttm_pool.c
>>> +++ b/drivers/gpu/drm/ttm/ttm_pool.c
>>> @@ -31,6 +31,8 @@
>>>     * cause they are rather slow compared to alloc_pages+map.
>>>     */
>>>    
>>> +#define pr_fmt(fmt) "[TTM POOL] " fmt
>>> +
>>>    #include <linux/module.h>
>>>    #include <linux/dma-mapping.h>
>>>    #include <linux/debugfs.h>
>>> @@ -47,6 +49,18 @@
>>>    
>>>    #include "ttm_module.h"
>>>    
>>> +#define TTM_MAX_ORDER (PMD_SHIFT - PAGE_SHIFT)
>>> +#define TTM_64K_ORDER (16 - PAGE_SHIFT)
>>> +#if (TTM_MAX_ORDER < TTM_64K_ORDER)
>>> +#undef TTM_MAX_ORDER
>>> +#define TTM_MAX_ORDER TTM_64K_ORDER
>>> +#endif
>>> +#if ((MAX_ORDER - 1) < TTM_MAX_ORDER)
>>> +#undef TTM_MAX_ORDER
>>> +#define TTM_MAX_ORDER (MAX_ORDER - 1)
>>> +#endif
>>> +#define TTM_DIM_ORDER (TTM_MAX_ORDER + 1)
>>> +
>>>    /**
>>>     * struct ttm_pool_dma - Helper object for coherent DMA mappings
>>>     *
>>> @@ -65,16 +79,18 @@ module_param(page_pool_size, ulong, 0644);
>>>    
>>>    static atomic_long_t allocated_pages;
>>>    
>>> -static struct ttm_pool_type global_write_combined[MAX_ORDER];
>>> -static struct ttm_pool_type global_uncached[MAX_ORDER];
>>> +static struct ttm_pool_type global_write_combined[TTM_DIM_ORDER];
>>> +static struct ttm_pool_type global_uncached[TTM_DIM_ORDER];
>>>    
>>> -static struct ttm_pool_type
>>> global_dma32_write_combined[MAX_ORDER];
>>> -static struct ttm_pool_type global_dma32_uncached[MAX_ORDER];
>>> +static struct ttm_pool_type
>>> global_dma32_write_combined[TTM_DIM_ORDER];
>>> +static struct ttm_pool_type global_dma32_uncached[TTM_DIM_ORDER];
>>>    
>>>    static spinlock_t shrinker_lock;
>>>    static struct list_head shrinker_list;
>>>    static struct shrinker mm_shrinker;
>>>    
>>> +static unsigned int ttm_pool_orders[] = {TTM_MAX_ORDER, 0, 0};
>>> +
>>>    /* Allocate pages of size 1 << order with the given gfp_flags */
>>>    static struct page *ttm_pool_alloc_page(struct ttm_pool *pool,
>>> gfp_t gfp_flags,
>>>                                          unsigned int order)
>>> @@ -400,6 +416,17 @@ static void __ttm_pool_free(struct ttm_pool
>>> *pool, struct ttm_tt *tt,
>>>          }
>>>    }
>>>    
>>> +static unsigned int ttm_pool_select_order(unsigned int order,
>>> pgoff_t num_pages)
>>> +{
>>> +       unsigned int *cur_order = ttm_pool_orders;
>>> +
>>> +       order = min_t(unsigned int, __fls(num_pages), order);
>>> +       while (order < *cur_order)
>>> +               ++cur_order;
>>> +
>>> +       return *cur_order;
>>> +}
>>> +
>>>    /**
>>>     * ttm_pool_alloc - Fill a ttm_tt object
>>>     *
>>> @@ -439,9 +466,8 @@ int ttm_pool_alloc(struct ttm_pool *pool,
>>> struct ttm_tt *tt,
>>>          else
>>>                  gfp_flags |= GFP_HIGHUSER;
>>>    
>>> -       for (order = min_t(unsigned int, MAX_ORDER - 1,
>>> __fls(num_pages));
>>> -            num_pages;
>>> -            order = min_t(unsigned int, order, __fls(num_pages)))
>>> {
>>> +       order = ttm_pool_select_order(ttm_pool_orders[0],
>>> num_pages);
>>> +       for (; num_pages; order = ttm_pool_select_order(order,
>>> num_pages)) {
>>>                  struct ttm_pool_type *pt;
>>>    
>>>                  page_caching = tt->caching;
>>> @@ -558,7 +584,7 @@ void ttm_pool_init(struct ttm_pool *pool,
>>> struct device *dev,
>>>    
>>>          if (use_dma_alloc) {
>>>                  for (i = 0; i < TTM_NUM_CACHING_TYPES; ++i)
>>> -                       for (j = 0; j < MAX_ORDER; ++j)
>>> +                       for (j = 0; j < TTM_DIM_ORDER; ++j)
>>>                                  ttm_pool_type_init(&pool-
>>>> caching[i].orders[j],
>>>                                                     pool, i, j);
>>>          }
>>> @@ -578,7 +604,7 @@ void ttm_pool_fini(struct ttm_pool *pool)
>>>    
>>>          if (pool->use_dma_alloc) {
>>>                  for (i = 0; i < TTM_NUM_CACHING_TYPES; ++i)
>>> -                       for (j = 0; j < MAX_ORDER; ++j)
>>> +                       for (j = 0; j < TTM_DIM_ORDER; ++j)
>>>                                  ttm_pool_type_fini(&pool-
>>>> caching[i].orders[j]);
>>>          }
>>>    
>>> @@ -632,7 +658,7 @@ static void ttm_pool_debugfs_header(struct
>>> seq_file *m)
>>>          unsigned int i;
>>>    
>>>          seq_puts(m, "\t ");
>>> -       for (i = 0; i < MAX_ORDER; ++i)
>>> +       for (i = 0; i < TTM_DIM_ORDER; ++i)
>>>                  seq_printf(m, " ---%2u---", i);
>>>          seq_puts(m, "\n");
>>>    }
>>> @@ -643,7 +669,7 @@ static void ttm_pool_debugfs_orders(struct
>>> ttm_pool_type *pt,
>>>    {
>>>          unsigned int i;
>>>    
>>> -       for (i = 0; i < MAX_ORDER; ++i)
>>> +       for (i = 0; i < TTM_DIM_ORDER; ++i)
>>>                  seq_printf(m, " %8u", ttm_pool_type_count(&pt[i]));
>>>          seq_puts(m, "\n");
>>>    }
>>> @@ -749,10 +775,16 @@ int ttm_pool_mgr_init(unsigned long
>>> num_pages)
>>>          if (!page_pool_size)
>>>                  page_pool_size = num_pages;
>>>    
>>> +       if (TTM_64K_ORDER < TTM_MAX_ORDER)
>>> +               ttm_pool_orders[1] = TTM_64K_ORDER;
>>> +
>>> +       pr_debug("Used orders are %u %u %u\n", ttm_pool_orders[0],
>>> +                ttm_pool_orders[1], ttm_pool_orders[2]);
>>> +
>>>          spin_lock_init(&shrinker_lock);
>>>          INIT_LIST_HEAD(&shrinker_list);
>>>    
>>> -       for (i = 0; i < MAX_ORDER; ++i) {
>>> +       for (i = 0; i < TTM_DIM_ORDER; ++i) {
>>>                  ttm_pool_type_init(&global_write_combined[i], NULL,
>>>                                     ttm_write_combined, i);
>>>                  ttm_pool_type_init(&global_uncached[i], NULL,
>>> ttm_uncached, i);
>>> @@ -785,7 +817,7 @@ void ttm_pool_mgr_fini(void)
>>>    {
>>>          unsigned int i;
>>>    
>>> -       for (i = 0; i < MAX_ORDER; ++i) {
>>> +       for (i = 0; i < TTM_DIM_ORDER; ++i) {
>>>                  ttm_pool_type_fini(&global_write_combined[i]);
>>>                  ttm_pool_type_fini(&global_uncached[i]);
>>>