[PATCH 3/3] drm/ttm: disable changing the global caching flags on newer AMD CPUs v2

Christian König ckoenig.leichtzumerken at gmail.com
Wed Aug 20 14:33:13 UTC 2025


On some x86 systems (old AMD Athlons, Intel Lunar Lake) we have the problem
that changing the caching flags of system memory requires updating the
global MTRR/PAT tables, since those CPUs can't handle aliasing caching
attributes.

But on most modern x86 systems (e.g. AMD CPUs after 2004) we don't need
that any more and can update the caching flags directly in the PTEs of the
userspace and kernel mappings.

We have already been doing this for memory encryption on 64-bit x86 for
quite a while, and all other supported platforms (SPARC, PowerPC, ARM,
MIPS, LoongArch) as well as the i915 driver have never done anything
different either.

So stop changing the global caching flags on CPUs which don't need it and
just insert a clflush to be on the safe side, so that we never return
memory with dirty cache lines.
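Condensed sketch of the resulting behaviour (names mirror the hunks below;
example_apply_caching() is a made-up wrapper, the real code lives in
ttm_pool_apply_caching(), and skip_caching_adjustment is set once in
ttm_pool_mgr_init() from the CPU vendor/feature check):

#include <linux/errno.h>
#include <linux/mm_types.h>
#include <asm/set_memory.h>		/* set_pages_array_wc()/_uc() */
#include <drm/drm_cache.h>		/* drm_clflush_pages() */
#include <drm/ttm/ttm_caching.h>	/* enum ttm_caching */

static bool skip_caching_adjustment;

static int example_apply_caching(struct page **pages,
				 unsigned long num_pages,
				 enum ttm_caching caching)
{
	if (skip_caching_adjustment) {
		/* Leave the global caching attributes alone, just make
		 * sure no dirty cache lines are handed out.
		 */
		drm_clflush_pages(pages, num_pages);
		return 0;
	}

	/* Old behaviour: adjust the kernel mapping of the pages. */
	switch (caching) {
	case ttm_cached:
		return 0;
	case ttm_write_combined:
		return set_pages_array_wc(pages, num_pages);
	case ttm_uncached:
		return set_pages_array_uc(pages, num_pages);
	}
	return -EINVAL;
}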

Testing on a Ryzen 5 and a Ryzen 7 shows that the clflush has no
measurable performance impact, but I'm still waiting for CI systems to
confirm functional correctness.

v2: drop the pool only on AMD CPUs for now

Signed-off-by: Christian König <christian.koenig at amd.com>
---
 drivers/gpu/drm/ttm/ttm_pool.c | 37 +++++++++++++++++++++++++---------
 1 file changed, 28 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c
index 83b10706ba89..3f830fb2aea5 100644
--- a/drivers/gpu/drm/ttm/ttm_pool.c
+++ b/drivers/gpu/drm/ttm/ttm_pool.c
@@ -45,6 +45,7 @@
 #include <drm/ttm/ttm_pool.h>
 #include <drm/ttm/ttm_tt.h>
 #include <drm/ttm/ttm_bo.h>
+#include <drm/drm_cache.h>
 
 #include "ttm_module.h"
 
@@ -119,6 +120,8 @@ module_param(page_pool_size, ulong, 0644);
 
 static atomic_long_t allocated_pages;
 
+static bool skip_caching_adjustment;
+
 static struct ttm_pool_type global_write_combined[NR_PAGE_ORDERS];
 static struct ttm_pool_type global_uncached[NR_PAGE_ORDERS];
 
@@ -195,7 +198,8 @@ static void ttm_pool_free_page(struct ttm_pool *pool, enum ttm_caching caching,
 	/* We don't care that set_pages_wb is inefficient here. This is only
 	 * used when we have to shrink and CPU overhead is irrelevant then.
 	 */
-	if (caching != ttm_cached && !PageHighMem(p))
+	if (!skip_caching_adjustment &&
+	    caching != ttm_cached && !PageHighMem(p))
 		set_pages_wb(p, 1 << order);
 #endif
 
@@ -223,13 +227,19 @@ static int ttm_pool_apply_caching(struct ttm_pool_alloc_state *alloc)
 	if (!num_pages)
 		return 0;
 
-	switch (alloc->tt_caching) {
-	case ttm_cached:
-		break;
-	case ttm_write_combined:
-		return set_pages_array_wc(alloc->caching_divide, num_pages);
-	case ttm_uncached:
-		return set_pages_array_uc(alloc->caching_divide, num_pages);
+	if (skip_caching_adjustment) {
+		drm_clflush_pages(alloc->caching_divide, num_pages);
+	} else {
+		switch (alloc->tt_caching) {
+		case ttm_cached:
+			break;
+		case ttm_write_combined:
+			return set_pages_array_wc(alloc->caching_divide,
+						  num_pages);
+		case ttm_uncached:
+			return set_pages_array_uc(alloc->caching_divide,
+						  num_pages);
+		}
 	}
 #endif
 	alloc->caching_divide = alloc->pages;
@@ -342,6 +352,9 @@ static struct ttm_pool_type *ttm_pool_select_type(struct ttm_pool *pool,
 		return &pool->caching[caching].orders[order];
 
 #ifdef CONFIG_X86
+	if (skip_caching_adjustment)
+		return NULL;
+
 	switch (caching) {
 	case ttm_write_combined:
 		if (pool->nid != NUMA_NO_NODE)
@@ -981,7 +994,7 @@ long ttm_pool_backup(struct ttm_pool *pool, struct ttm_tt *tt,
 
 #ifdef CONFIG_X86
 	/* Anything returned to the system needs to be cached. */
-	if (tt->caching != ttm_cached)
+	if (!skip_caching_adjustment && tt->caching != ttm_cached)
 		set_pages_array_wb(tt->pages, tt->num_pages);
 #endif
 
@@ -1296,6 +1309,12 @@ int ttm_pool_mgr_init(unsigned long num_pages)
 	spin_lock_init(&shrinker_lock);
 	INIT_LIST_HEAD(&shrinker_list);
 
+#ifdef CONFIG_X86
+	skip_caching_adjustment =
+		(boot_cpu_data.x86_vendor == X86_VENDOR_AMD) &&
+		static_cpu_has(X86_FEATURE_CLFLUSH);
+#endif
+
 	for (i = 0; i < NR_PAGE_ORDERS; ++i) {
 		ttm_pool_type_init(&global_write_combined[i], NULL,
 				   ttm_write_combined, i);
-- 
2.43.0


