[PATCH] drm/ttm: WIP limit the TTM pool to 32bit CPUs
Matthew Auld
matthew.auld at intel.com
Tue Aug 12 10:36:04 UTC 2025
On 06/08/2025 14:28, Christian König wrote:
> On some old x86 systems we had the problem that changing the caching flags
> of system memory required changing the global MTRR/PAT tables.
>
> But on any modern x86 system (CPUs introduced roughly after 2004) we
> don't actually need that anymore and can update the caching flags
> directly in the PTEs of the CPU mapping. The old path was just never
> disabled because of the fear of regressions.
>
> We have already been using the PTE flags for encryption on x86 64bit for
> quite a while, and all other supported platforms (SPARC, PowerPC, ARM,
> MIPS, LoongArch) have never done anything different either.
>
> So disable the page pool completely for 64bit systems and just insert a
> clflush to be on the safe side so that we never return memory with dirty
> cache lines.
>
> Testing on a Ryzen 5 and 7 shows that this has absolutely no performance
> impact, and offhand the AMD CI can't find a problem either.
>
> Let's see what the i915 and XE CI systems say to that.
>
> Signed-off-by: Christian König <christian.koenig at amd.com>
> ---
> drivers/gpu/drm/ttm/ttm_pool.c | 16 +++++++++++-----
> 1 file changed, 11 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c
> index baf27c70a419..7487eac29398 100644
> --- a/drivers/gpu/drm/ttm/ttm_pool.c
> +++ b/drivers/gpu/drm/ttm/ttm_pool.c
> @@ -38,7 +38,7 @@
> #include <linux/highmem.h>
> #include <linux/sched/mm.h>
>
> -#ifdef CONFIG_X86
> +#ifdef CONFIG_X86_32
> #include <asm/set_memory.h>
> #endif
>
> @@ -46,6 +46,7 @@
> #include <drm/ttm/ttm_pool.h>
> #include <drm/ttm/ttm_tt.h>
> #include <drm/ttm/ttm_bo.h>
> +#include <drm/drm_cache.h>
>
> #include "ttm_module.h"
>
> @@ -192,7 +193,7 @@ static void ttm_pool_free_page(struct ttm_pool *pool, enum ttm_caching caching,
> struct ttm_pool_dma *dma;
> void *vaddr;
>
> -#ifdef CONFIG_X86
> +#ifdef CONFIG_X86_32
> /* We don't care that set_pages_wb is inefficient here. This is only
> * used when we have to shrink and CPU overhead is irrelevant then.
> */
> @@ -218,7 +219,7 @@ static void ttm_pool_free_page(struct ttm_pool *pool, enum ttm_caching caching,
> /* Apply any cpu-caching deferred during page allocation */
> static int ttm_pool_apply_caching(struct ttm_pool_alloc_state *alloc)
> {
> -#ifdef CONFIG_X86
> +#ifdef CONFIG_X86_32
> unsigned int num_pages = alloc->pages - alloc->caching_divide;
>
> if (!num_pages)
> @@ -232,6 +233,11 @@ static int ttm_pool_apply_caching(struct ttm_pool_alloc_state *alloc)
> case ttm_uncached:
> return set_pages_array_uc(alloc->caching_divide, num_pages);
> }
> +
> +#elif defined(CONFIG_X86_64)
> + unsigned int num_pages = alloc->pages - alloc->caching_divide;
> +
> + drm_clflush_pages(alloc->caching_divide, num_pages);
Do we now also need manual clflushing for things like swap-in? What
caching type does the copy_highpage() in ttm_tt_swapin() use now? Is it
not wb? If so, is that not one of the nice advantages of using
set_pages_*, where a raw kmap will respect the PAT setting?
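
To make the concern concrete, here is roughly the shape of the swap-in
copy loop as I read it (needs <linux/shmem_fs.h>, <linux/highmem.h> and
<drm/drm_cache.h>); the flush at the end is made up by me, purely to
illustrate what I'd expect to be needed if the answer is wb:

	/*
	 * Illustrative sketch only: the loop follows ttm_tt_swapin(),
	 * the final flush is the hypothetical part. copy_highpage()
	 * goes through kmap_local_page(), which on x86-64 is just the
	 * cached linear mapping, so the copied lines sit dirty in the
	 * cache afterwards.
	 */
	struct address_space *swap_space = ttm->swap_storage->f_mapping;
	struct page *from_page, *to_page;
	pgoff_t i;

	for (i = 0; i < ttm->num_pages; ++i) {
		from_page = shmem_read_mapping_page(swap_space, i);
		if (IS_ERR(from_page))
			goto out_err;
		to_page = ttm->pages[i];
		copy_highpage(to_page, from_page); /* wb write on x86-64 */
		put_page(from_page);
	}

	/*
	 * Hypothetical addition: push the dirty lines out before the
	 * pages are handed back to a wc/uc consumer, mirroring what
	 * this patch does at allocation time.
	 */
	if (ttm->caching != ttm_cached)
		drm_clflush_pages(ttm->pages, ttm->num_pages);
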
> #endif
> alloc->caching_divide = alloc->pages;
> return 0;
> @@ -342,7 +348,7 @@ static struct ttm_pool_type *ttm_pool_select_type(struct ttm_pool *pool,
> if (pool->use_dma_alloc)
> return &pool->caching[caching].orders[order];
>
> -#ifdef CONFIG_X86
> +#ifdef CONFIG_X86_32
> switch (caching) {
> case ttm_write_combined:
> if (pool->nid != NUMA_NO_NODE)
> @@ -980,7 +986,7 @@ long ttm_pool_backup(struct ttm_pool *pool, struct ttm_tt *tt,
> pool->use_dma_alloc || ttm_tt_is_backed_up(tt))
> return -EBUSY;
>
> -#ifdef CONFIG_X86
> +#ifdef CONFIG_X86_32
> /* Anything returned to the system needs to be cached. */
> if (tt->caching != ttm_cached)
> set_pages_array_wb(tt->pages, tt->num_pages);
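
Presumably the same question applies in the swap-out direction here:
with set_pages_array_wb() compiled out on 64bit, the backup copy also
goes through the cached linear mapping, against pages that may have
been written through a wc alias in the meantime. Something like this
(untested sketch, both the placement and the #ifdef are my guess)
would be the conservative equivalent:

	#ifdef CONFIG_X86_64
		/*
		 * Sketch only: drm_clflush_pages() writes back and
		 * invalidates, so stale lines cached via the wb alias
		 * can't shadow data written through a wc mapping.
		 */
		if (tt->caching != ttm_cached)
			drm_clflush_pages(tt->pages, tt->num_pages);
	#endif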