[PATCH] drm/ttm: Implement strict NUMA pool allocations

Fri Mar 22 15:29:25 UTC 2024

>-----Original Message-----
>From: dri-devel <dri-devel-bounces at lists.freedesktop.org> On Behalf Of
>Rajneesh Bhardwaj
>Sent: Friday, March 22, 2024 3:08 AM
>To: amd-gfx at lists.freedesktop.org; dri-devel at lists.freedesktop.org
>Cc: felix.kuehling at amd.com; alexander.deucher at amd.com;
>christian.koenig at amd.com; Rajneesh Bhardwaj
><rajneesh.bhardwaj at amd.com>; Joe Greathouse
><joseph.greathouse at amd.com>
>Subject: [PATCH] drm/ttm: Implement strict NUMA pool allocations
>
>This change allows TTM to be flexible to honor NUMA localized
>allocations which can result in significant performance improvement on a
>multi socket NUMA system. On GFXIP 9.4.3 based AMD APUs, we see
>manyfold benefits of this change resulting not only in ~10% performance
>improvement in certain benchmarks but also generating more consistent
>and less sporadic results specially when the NUMA balancing is not
>explecitely disabled. In certain scenarios, workloads show a run-to-run
>variability e.g. HPL would show a ~10x performance drop after running
>back to back 4-5 times and would recover later on a subsequent run. This
>is seen with memory intensive other workloads too. It was seen that when
>CPU caches were dropped e.g. sudo sysctl -w vm.drop_caches=1, the
>variability reduced but the performance was still well below a good run.
>
>Use of __GFP_THISNODE flag ensures that during memory allocation, kernel
>prioritizes memory allocations from the local or closest NUMA node
>thereby reducing memory access latency. When memory is allocated using
>__GFP_THISNODE flag, memory allocations will predominantly be done on
>the local node, consequency, the shrinkers may priotitize reclaiming
>memory from caches assocoated with local node to maintain memory
>locality and minimize latency, thereby provide better shinker targeting.
>
>Reduced memory pressure on remote nodes, can also indirectly influence
>shrinker behavior by potentially reducing the frequency and intensity of
>memory reclamation operation on remote nodes and could provide improved
>overall system performance.
>
>While this change could be more beneficial in general, i.e., without the
>use of a module parameter, but in absence of widespread testing, limit
>it to the AMD GFXIP 9.4.3 based ttm pool initializations only.
>
>
>Cc: Joe Greathouse <joseph.greathouse at amd.com>
>Signed-off-by: Rajneesh Bhardwaj <rajneesh.bhardwaj at amd.com>
>---
> drivers/gpu/drm/amd/amdgpu/amdgpu.h       |  1 +
> drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c   |  8 ++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c   |  7 ++++++-
> drivers/gpu/drm/ttm/tests/ttm_pool_test.c | 10 +++++-----
> drivers/gpu/drm/ttm/ttm_device.c          |  2 +-
> drivers/gpu/drm/ttm/ttm_pool.c            |  7 ++++++-
> include/drm/ttm/ttm_pool.h                |  4 +++-
> 7 files changed, 30 insertions(+), 9 deletions(-)
>
>diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>index 9c62552bec34..96532cfc6230 100644
>--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>@@ -253,6 +253,7 @@ extern int amdgpu_user_partt_mode;
> extern int amdgpu_agp;
>
> extern int amdgpu_wbrf;
>+extern bool strict_numa_alloc;
>
> #define AMDGPU_VM_MAX_NUM_CTX			4096
> #define AMDGPU_SG_THRESHOLD			(256*1024*1024)
>diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>index 80b9642f2bc4..a183a6b4493d 100644
>--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>@@ -781,6 +781,14 @@ int queue_preemption_timeout_ms = 9000;
> module_param(queue_preemption_timeout_ms, int, 0644);
> MODULE_PARM_DESC(queue_preemption_timeout_ms, "queue preemption
>timeout in ms (1 = Minimum, 9000 = default)");
>
>+/**
>+ * DOC: strict_numa_alloc(bool)
>+ * Policy to force NUMA allocation requests from the proximity NUMA domain
>only.
>+ */
>+bool strict_numa_alloc;
>+module_param(strict_numa_alloc, bool, 0444);
>+MODULE_PARM_DESC(strict_numa_alloc, "Force NUMA allocation requests
>to be satisfied from the closest node only (false = default)");
>+
> /**
>  * DOC: debug_evictions(bool)
>  * Enable extra debug messages to help determine the cause of evictions
>diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>index b0ed10f4de60..a9f78f85e28c 100644
>--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>@@ -1768,6 +1768,7 @@ static int amdgpu_ttm_reserve_tmr(struct
>amdgpu_device *adev)
>
> static int amdgpu_ttm_pools_init(struct amdgpu_device *adev)
> {
>+	bool policy = true;
> 	int i;
>
> 	if (!adev->gmc.is_app_apu || !adev->gmc.num_mem_partitions)
>@@ -1779,11 +1780,15 @@ static int amdgpu_ttm_pools_init(struct
>amdgpu_device *adev)
> 	if (!adev->mman.ttm_pools)
> 		return -ENOMEM;
>
>+	/* Policy not only depends on the module param but also on the ASIC
>+	 * setting use_strict_numa_alloc as well.
>+	 */
> 	for (i = 0; i < adev->gmc.num_mem_partitions; i++) {
> 		ttm_pool_init(&adev->mman.ttm_pools[i], adev->dev,
> 			      adev->gmc.mem_partitions[i].numa.node,
>-			      false, false);
>+			      false, false, policy && strict_numa_alloc);

why not just 'strict_numa_alloc'?

Is 'policy' used somewhere else?  Not sure this adds clarity...

> 	}
>+
> 	return 0;
> }
>
>diff --git a/drivers/gpu/drm/ttm/tests/ttm_pool_test.c
>b/drivers/gpu/drm/ttm/tests/ttm_pool_test.c
>index 2d9cae8cd984..6ff47aac570a 100644
>--- a/drivers/gpu/drm/ttm/tests/ttm_pool_test.c
>+++ b/drivers/gpu/drm/ttm/tests/ttm_pool_test.c
>@@ -87,7 +87,7 @@ static struct ttm_pool *ttm_pool_pre_populated(struct
>kunit *test,
> 	pool = kunit_kzalloc(test, sizeof(*pool), GFP_KERNEL);
> 	KUNIT_ASSERT_NOT_NULL(test, pool);
>
>-	ttm_pool_init(pool, devs->dev, NUMA_NO_NODE, true, false);
>+	ttm_pool_init(pool, devs->dev, NUMA_NO_NODE, true, false, false);

Is this really an acceptable interface (three non-named Booleans in a row)?

I have no idea what "true, false, false" means.

Would having a "flags" parameter and then

USE_DMA_ALLOC 	BIT(0)
USE_DMA32		BIT(1)
USE_STRICT_NUMA	BIT(2)

Make this more readable?

Or define your Booleans:

USE_DMA_ALLOC 	true
USE_DMA32		true
USE_STRICT_NUMA	true

NO_DMA_ALLOC	false
NO_DMA32		false
NO_STRICT_NUMA	false

So at a minimum, we might know what these parameters are?

What is the relationship between this feature and the nid value?

Is this value used for the allocations?

If this is not NUMA_NO_NODE, would this do the same thing?
(or is the STRICT flag the only way?)

Just some thoughts,

Mike

>
> 	err = ttm_pool_alloc(pool, tt, &simple_ctx);
> 	KUNIT_ASSERT_EQ(test, err, 0);
>@@ -152,7 +152,7 @@ static void ttm_pool_alloc_basic(struct kunit *test)
> 	KUNIT_ASSERT_NOT_NULL(test, pool);
>
> 	ttm_pool_init(pool, devs->dev, NUMA_NO_NODE, params-
>>use_dma_alloc,
>-		      false);
>+		      false, false);
>
> 	KUNIT_ASSERT_PTR_EQ(test, pool->dev, devs->dev);
> 	KUNIT_ASSERT_EQ(test, pool->nid, NUMA_NO_NODE);
>@@ -219,7 +219,7 @@ static void ttm_pool_alloc_basic_dma_addr(struct
>kunit *test)
> 	pool = kunit_kzalloc(test, sizeof(*pool), GFP_KERNEL);
> 	KUNIT_ASSERT_NOT_NULL(test, pool);
>
>-	ttm_pool_init(pool, devs->dev, NUMA_NO_NODE, true, false);
>+	ttm_pool_init(pool, devs->dev, NUMA_NO_NODE, true, false, false);
>
> 	err = ttm_pool_alloc(pool, tt, &simple_ctx);
> 	KUNIT_ASSERT_EQ(test, err, 0);
>@@ -349,7 +349,7 @@ static void ttm_pool_free_dma_alloc(struct kunit
>*test)
> 	pool = kunit_kzalloc(test, sizeof(*pool), GFP_KERNEL);
> 	KUNIT_ASSERT_NOT_NULL(test, pool);
>
>-	ttm_pool_init(pool, devs->dev, NUMA_NO_NODE, true, false);
>+	ttm_pool_init(pool, devs->dev, NUMA_NO_NODE, true, false, false);
> 	ttm_pool_alloc(pool, tt, &simple_ctx);
>
> 	pt = &pool->caching[caching].orders[order];
>@@ -380,7 +380,7 @@ static void ttm_pool_free_no_dma_alloc(struct kunit
>*test)
> 	pool = kunit_kzalloc(test, sizeof(*pool), GFP_KERNEL);
> 	KUNIT_ASSERT_NOT_NULL(test, pool);
>
>-	ttm_pool_init(pool, devs->dev, NUMA_NO_NODE, false, false);
>+	ttm_pool_init(pool, devs->dev, NUMA_NO_NODE, false, false, false);
> 	ttm_pool_alloc(pool, tt, &simple_ctx);
>
> 	pt = &pool->caching[caching].orders[order];
>diff --git a/drivers/gpu/drm/ttm/ttm_device.c
>b/drivers/gpu/drm/ttm/ttm_device.c
>index f5187b384ae9..540e8a44f015 100644
>--- a/drivers/gpu/drm/ttm/ttm_device.c
>+++ b/drivers/gpu/drm/ttm/ttm_device.c
>@@ -215,7 +215,7 @@ int ttm_device_init(struct ttm_device *bdev, const
>struct ttm_device_funcs *func
>
> 	ttm_sys_man_init(bdev);
>
>-	ttm_pool_init(&bdev->pool, dev, dev_to_node(dev), use_dma_alloc,
>use_dma32);
>+	ttm_pool_init(&bdev->pool, dev, dev_to_node(dev), use_dma_alloc,
>use_dma32, false);
>
> 	bdev->vma_manager = vma_manager;
> 	spin_lock_init(&bdev->lru_lock);
>diff --git a/drivers/gpu/drm/ttm/ttm_pool.c
>b/drivers/gpu/drm/ttm/ttm_pool.c
>index dbc96984d331..73aafd06c361 100644
>--- a/drivers/gpu/drm/ttm/ttm_pool.c
>+++ b/drivers/gpu/drm/ttm/ttm_pool.c
>@@ -447,6 +447,9 @@ int ttm_pool_alloc(struct ttm_pool *pool, struct
>ttm_tt *tt,
> 	else
> 		gfp_flags |= GFP_HIGHUSER;
>
>+	if (pool->use_strict_numa_alloc)
>+		gfp_flags |= __GFP_THISNODE;
>+
> 	for (order = min_t(unsigned int, MAX_ORDER, __fls(num_pages));
> 	     num_pages;
> 	     order = min_t(unsigned int, order, __fls(num_pages))) {
>@@ -555,7 +558,8 @@ EXPORT_SYMBOL(ttm_pool_free);
>  * Initialize the pool and its pool types.
>  */
> void ttm_pool_init(struct ttm_pool *pool, struct device *dev,
>-		   int nid, bool use_dma_alloc, bool use_dma32)
>+		   int nid, bool use_dma_alloc, bool use_dma32,
>+		   bool use_strict_numa_alloc)
> {
> 	unsigned int i, j;
>
>@@ -565,6 +569,7 @@ void ttm_pool_init(struct ttm_pool *pool, struct
>device *dev,
> 	pool->nid = nid;
> 	pool->use_dma_alloc = use_dma_alloc;
> 	pool->use_dma32 = use_dma32;
>+	pool->use_strict_numa_alloc = use_strict_numa_alloc;
>
> 	if (use_dma_alloc || nid != NUMA_NO_NODE) {
> 		for (i = 0; i < TTM_NUM_CACHING_TYPES; ++i)
>diff --git a/include/drm/ttm/ttm_pool.h b/include/drm/ttm/ttm_pool.h
>index 30a347e5aa11..6b7bdc952466 100644
>--- a/include/drm/ttm/ttm_pool.h
>+++ b/include/drm/ttm/ttm_pool.h
>@@ -72,6 +72,7 @@ struct ttm_pool {
>
> 	bool use_dma_alloc;
> 	bool use_dma32;
>+	bool use_strict_numa_alloc;
>
> 	struct {
> 		struct ttm_pool_type orders[MAX_ORDER + 1];
>@@ -83,7 +84,8 @@ int ttm_pool_alloc(struct ttm_pool *pool, struct ttm_tt
>*tt,
> void ttm_pool_free(struct ttm_pool *pool, struct ttm_tt *tt);
>
> void ttm_pool_init(struct ttm_pool *pool, struct device *dev,
>-		   int nid, bool use_dma_alloc, bool use_dma32);
>+		   int nid, bool use_dma_alloc, bool use_dma32,
>+		   bool use_strict_numa_alloc);
> void ttm_pool_fini(struct ttm_pool *pool);
>
> int ttm_pool_debugfs(struct ttm_pool *pool, struct seq_file *m);
>--
>2.34.1