[RFC PATCH 1/2] drm/ttm/pool: Introduce a way to skip clear on free
Nirmoy Das
nirmoy.das at intel.com
Thu Jun 20 14:37:47 UTC 2024
Hi Christian,
On 6/20/2024 4:08 PM, Christian König wrote:
> Am 20.06.24 um 15:46 schrieb Nirmoy Das:
>> Clearing pages can be very slow when using CPU but GPUs can perform this
>> task much faster. With this new pool API driver can decide if it
>> wants to
>> clear pages using GPU. This provides the flexibility to choose the
>> desired
>> clear policy, either during allocation or upon freeing, as per the
>> driver's
>> preference.
>
> We already have the TTM_TT_FLAG_ZERO_ALLOC to indicate if pages needs
> to be cleared on alloc from the OS.
>
> I'm not sure if we really need the option to not clear them in the
> pool as well, but if we really need this I suggest to switch from
> clear on free to clear on alloc again and just honor the flag.
Perf reported higher latency because of clearing pages before giving
back to the pool. I think it would be nice if drm driver could avoid it.
I can modify this to move clearing page to ttm_pool_type_take() to
honor TTM_TT_FLAG_ZERO_ALLOC flags.
>
> Alternatively you could also split the pools into cleared and not
> cleared pages as well.
Could you expand this please ?
I have another question. Our userspace team have found that there is
higher latency for ttm_cached type buffer as well and using gpu clear
doesn't help much
because kernel will clear pages anyways if
alloc_on_init/CONFIG_INIT_ON_ALLOC_DEFAULT_ON is active. I see that only
way to mitigate this is to use a pool for
ttm_cached buffers. I was thinking of using a pool flag to also allow
drm driver to create a pool for ttm_cached. I wonder what do you think
about it and if
there is any other better solution.
Thanks,
Nirmoy
>
> Regards,
> Christian.
>
>>
>> Cc: Christian Koenig <christian.koenig at amd.com>
>> Cc: "Thomas Hellström" <thomas.hellstrom at linux.intel.com>
>> Cc: Matthew Auld <matthew.auld at intel.com>
>> Signed-off-by: Nirmoy Das <nirmoy.das at intel.com>
>> ---
>> drivers/gpu/drm/ttm/ttm_device.c | 42 +++++++++++++++++++++++----
>> drivers/gpu/drm/ttm/ttm_pool.c | 49 +++++++++++++++++++++++++-------
>> include/drm/ttm/ttm_device.h | 8 ++++++
>> include/drm/ttm/ttm_pool.h | 11 +++++++
>> 4 files changed, 94 insertions(+), 16 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/ttm/ttm_device.c
>> b/drivers/gpu/drm/ttm/ttm_device.c
>> index 434cf0258000..54a3ea825c2e 100644
>> --- a/drivers/gpu/drm/ttm/ttm_device.c
>> +++ b/drivers/gpu/drm/ttm/ttm_device.c
>> @@ -191,15 +191,19 @@ EXPORT_SYMBOL(ttm_device_swapout);
>> * @vma_manager: A pointer to a vma manager.
>> * @use_dma_alloc: If coherent DMA allocation API should be used.
>> * @use_dma32: If we should use GFP_DMA32 for device memory
>> allocations.
>> + * @pool_flags: Desired pool flags
>> *
>> * Initializes a struct ttm_device:
>> * Returns:
>> * !0: Failure.
>> */
>> -int ttm_device_init(struct ttm_device *bdev, const struct
>> ttm_device_funcs *funcs,
>> - struct device *dev, struct address_space *mapping,
>> - struct drm_vma_offset_manager *vma_manager,
>> - bool use_dma_alloc, bool use_dma32)
>> +int ttm_device_init_with_pool_flags(struct ttm_device *bdev,
>> + const struct ttm_device_funcs *funcs,
>> + struct device *dev,
>> + struct address_space *mapping,
>> + struct drm_vma_offset_manager *vma_manager,
>> + bool use_dma_alloc, bool use_dma32,
>> + unsigned int pool_flags)
>> {
>> struct ttm_global *glob = &ttm_glob;
>> int ret, nid;
>> @@ -227,7 +231,8 @@ int ttm_device_init(struct ttm_device *bdev,
>> const struct ttm_device_funcs *func
>> else
>> nid = NUMA_NO_NODE;
>> - ttm_pool_init(&bdev->pool, dev, nid, use_dma_alloc, use_dma32);
>> + ttm_pool_init_with_flags(&bdev->pool, dev, nid, use_dma_alloc,
>> + use_dma32, pool_flags);
>> bdev->vma_manager = vma_manager;
>> spin_lock_init(&bdev->lru_lock);
>> @@ -239,6 +244,33 @@ int ttm_device_init(struct ttm_device *bdev,
>> const struct ttm_device_funcs *func
>> return 0;
>> }
>> +EXPORT_SYMBOL(ttm_device_init_with_pool_flags);
>> +
>> +
>> +/**
>> + * ttm_device_init
>> + *
>> + * @bdev: A pointer to a struct ttm_device to initialize.
>> + * @funcs: Function table for the device.
>> + * @dev: The core kernel device pointer for DMA mappings and
>> allocations.
>> + * @mapping: The address space to use for this bo.
>> + * @vma_manager: A pointer to a vma manager.
>> + * @use_dma_alloc: If coherent DMA allocation API should be used.
>> + * @use_dma32: If we should use GFP_DMA32 for device memory
>> allocations.
>> + *
>> + * Initializes a struct ttm_device:
>> + * Returns:
>> + * !0: Failure.
>> + */
>> +int ttm_device_init(struct ttm_device *bdev, const struct
>> ttm_device_funcs *funcs,
>> + struct device *dev, struct address_space *mapping,
>> + struct drm_vma_offset_manager *vma_manager,
>> + bool use_dma_alloc, bool use_dma32)
>> +{
>> + return ttm_device_init_with_pool_flags(bdev, funcs, dev, mapping,
>> + vma_manager, use_dma_alloc,
>> + use_dma32, 0);
>> +}
>> EXPORT_SYMBOL(ttm_device_init);
>> void ttm_device_fini(struct ttm_device *bdev)
>> diff --git a/drivers/gpu/drm/ttm/ttm_pool.c
>> b/drivers/gpu/drm/ttm/ttm_pool.c
>> index 6e1fd6985ffc..6f33c3e7cdf2 100644
>> --- a/drivers/gpu/drm/ttm/ttm_pool.c
>> +++ b/drivers/gpu/drm/ttm/ttm_pool.c
>> @@ -222,15 +222,17 @@ static void ttm_pool_unmap(struct ttm_pool
>> *pool, dma_addr_t dma_addr,
>> }
>> /* Give pages into a specific pool_type */
>> -static void ttm_pool_type_give(struct ttm_pool_type *pt, struct page
>> *p)
>> +static void ttm_pool_type_give(struct ttm_pool_type *pt, struct page
>> *p, bool skip_clear)
>> {
>> unsigned int i, num_pages = 1 << pt->order;
>> - for (i = 0; i < num_pages; ++i) {
>> - if (PageHighMem(p))
>> - clear_highpage(p + i);
>> - else
>> - clear_page(page_address(p + i));
>> + if (!skip_clear) {
>> + for (i = 0; i < num_pages; ++i) {
>> + if (PageHighMem(p))
>> + clear_highpage(p + i);
>> + else
>> + clear_page(page_address(p + i));
>> + }
>> }
>> spin_lock(&pt->lock);
>> @@ -396,7 +398,10 @@ static void ttm_pool_free_range(struct ttm_pool
>> *pool, struct ttm_tt *tt,
>> struct page **pages = &tt->pages[start_page];
>> unsigned int order;
>> pgoff_t i, nr;
>> + bool skip_clear = false;
>> + if (pool->flags & TTM_POOL_FLAG_SKIP_CLEAR_ON_FREE)
>> + skip_clear = true;
>> for (i = start_page; i < end_page; i += nr, pages += nr) {
>> struct ttm_pool_type *pt = NULL;
>> @@ -407,7 +412,7 @@ static void ttm_pool_free_range(struct ttm_pool
>> *pool, struct ttm_tt *tt,
>> pt = ttm_pool_select_type(pool, caching, order);
>> if (pt)
>> - ttm_pool_type_give(pt, *pages);
>> + ttm_pool_type_give(pt, *pages, skip_clear);
>> else
>> ttm_pool_free_page(pool, caching, order, *pages);
>> }
>> @@ -550,18 +555,21 @@ void ttm_pool_free(struct ttm_pool *pool,
>> struct ttm_tt *tt)
>> EXPORT_SYMBOL(ttm_pool_free);
>> /**
>> - * ttm_pool_init - Initialize a pool
>> + * ttm_pool_init_with_flags - Initialize a pool with flags
>> *
>> * @pool: the pool to initialize
>> * @dev: device for DMA allocations and mappings
>> * @nid: NUMA node to use for allocations
>> * @use_dma_alloc: true if coherent DMA alloc should be used
>> * @use_dma32: true if GFP_DMA32 should be used
>> + * @flags: control flags for the pool
>> + *
>> + * Initialize the pool and its pool types with flags to modify defaults
>> *
>> - * Initialize the pool and its pool types.
>> */
>> -void ttm_pool_init(struct ttm_pool *pool, struct device *dev,
>> - int nid, bool use_dma_alloc, bool use_dma32)
>> +void ttm_pool_init_with_flags(struct ttm_pool *pool, struct device
>> *dev,
>> + int nid, bool use_dma_alloc, bool use_dma32,
>> + unsigned int flags)
>> {
>> unsigned int i, j;
>> @@ -571,6 +579,7 @@ void ttm_pool_init(struct ttm_pool *pool,
>> struct device *dev,
>> pool->nid = nid;
>> pool->use_dma_alloc = use_dma_alloc;
>> pool->use_dma32 = use_dma32;
>> + pool->flags = flags;
>> for (i = 0; i < TTM_NUM_CACHING_TYPES; ++i) {
>> for (j = 0; j < NR_PAGE_ORDERS; ++j) {
>> @@ -585,6 +594,24 @@ void ttm_pool_init(struct ttm_pool *pool, struct
>> device *dev,
>> }
>> }
>> }
>> +EXPORT_SYMBOL(ttm_pool_init_with_flags);
>> +
>> +/**
>> + * ttm_pool_init - Initialize a pool
>> + *
>> + * @pool: the pool to initialize
>> + * @dev: device for DMA allocations and mappings
>> + * @nid: NUMA node to use for allocations
>> + * @use_dma_alloc: true if coherent DMA alloc should be used
>> + * @use_dma32: true if GFP_DMA32 should be used
>> + *
>> + * Initialize the pool and its pool types.
>> + */
>> +void ttm_pool_init(struct ttm_pool *pool, struct device *dev,
>> + int nid, bool use_dma_alloc, bool use_dma32)
>> +{
>> + ttm_pool_init_with_flags(pool, dev, nid, use_dma_alloc,
>> use_dma32, 0);
>> +}
>> EXPORT_SYMBOL(ttm_pool_init);
>> /**
>> diff --git a/include/drm/ttm/ttm_device.h b/include/drm/ttm/ttm_device.h
>> index c22f30535c84..1b20c5798e97 100644
>> --- a/include/drm/ttm/ttm_device.h
>> +++ b/include/drm/ttm/ttm_device.h
>> @@ -291,6 +291,14 @@ int ttm_device_init(struct ttm_device *bdev,
>> const struct ttm_device_funcs *func
>> struct device *dev, struct address_space *mapping,
>> struct drm_vma_offset_manager *vma_manager,
>> bool use_dma_alloc, bool use_dma32);
>> +int ttm_device_init_with_pool_flags(struct ttm_device *bdev,
>> + const struct ttm_device_funcs *funcs,
>> + struct device *dev,
>> + struct address_space *mapping,
>> + struct drm_vma_offset_manager *vma_manager,
>> + bool use_dma_alloc, bool use_dma32,
>> + unsigned int pool_flags);
>> +
>> void ttm_device_fini(struct ttm_device *bdev);
>> void ttm_device_clear_dma_mappings(struct ttm_device *bdev);
>> diff --git a/include/drm/ttm/ttm_pool.h b/include/drm/ttm/ttm_pool.h
>> index 160d954a261e..9822996309e5 100644
>> --- a/include/drm/ttm/ttm_pool.h
>> +++ b/include/drm/ttm/ttm_pool.h
>> @@ -66,10 +66,17 @@ struct ttm_pool_type {
>> * @use_dma_alloc: if coherent DMA allocations should be used
>> * @use_dma32: if GFP_DMA32 should be used
>> * @caching: pools for each caching/order
>> + * @flags: flags to control certain pool behaviour
>> + *
>> + * The @flags can be:
>> + * - %TTM_POOL_FLAG_SKIP_CLEAR_ON_FREE - This flag can be used to
>> + * skip clear on free when driver decides to do that on it's own.
>> */
>> struct ttm_pool {
>> struct device *dev;
>> int nid;
>> +#define TTM_POOL_FLAG_SKIP_CLEAR_ON_FREE 1 << 0
>> + unsigned int flags;
>> bool use_dma_alloc;
>> bool use_dma32;
>> @@ -85,6 +92,10 @@ void ttm_pool_free(struct ttm_pool *pool, struct
>> ttm_tt *tt);
>> void ttm_pool_init(struct ttm_pool *pool, struct device *dev,
>> int nid, bool use_dma_alloc, bool use_dma32);
>> +void ttm_pool_init_with_flags(struct ttm_pool *pool, struct device
>> *dev,
>> + int nid, bool use_dma_alloc, bool use_dma32,
>> + unsigned int flags);
>> +
>> void ttm_pool_fini(struct ttm_pool *pool);
>> int ttm_pool_debugfs(struct ttm_pool *pool, struct seq_file *m);
>
More information about the dri-devel
mailing list