[PATCH v2 1/2] drm/buddy: Optimize free block management with RB tree
Matthew Auld
matthew.auld at intel.com
Wed Aug 20 10:14:39 UTC 2025
On 20/08/2025 08:56, Arunpravin Paneer Selvam wrote:
> Hi Matthew,
>
> On 8/14/2025 4:15 PM, Matthew Auld wrote:
>> On 24/07/2025 11:46, Arunpravin Paneer Selvam wrote:
>>> Replace the freelist (O(n)) used for free block management with a
>>> red-black tree, providing more efficient O(log n) search, insert,
>>> and delete operations. This improves scalability and performance
>>> when managing large numbers of free blocks per order (e.g., hundreds
>>> or thousands).
>>>
>>> In the VK-CTS memory stress subtest, the buddy manager merges
>>> fragmented memory and inserts freed blocks into the freelist. Since
>>> freelist insertion is O(n), this becomes a bottleneck as fragmentation
>>> increases. Benchmarking shows list_insert_sorted() consumes ~52.69% CPU
>>> with the freelist, compared to just 0.03% with the RB tree
>>> (rbtree_insert.isra.0), despite performing the same sorted insert.
>>>
>>> This also improves performance in heavily fragmented workloads,
>>> such as games or graphics tests that stress memory.
>>
>> Neat. Also please Cc intel-gfx at lists.freedesktop.org and intel-
>> xe at lists.freedesktop.org on the next revision so our CI can pick this up.
> Sure, I will add on v3.
>>
>>>
>>> Signed-off-by: Arunpravin Paneer Selvam
>>> <Arunpravin.PaneerSelvam at amd.com>
>>> ---
>>> drivers/gpu/drm/drm_buddy.c | 141 +++++++++++++++++++++++-------------
>>> include/drm/drm_buddy.h | 39 +++++++++-
>>> 2 files changed, 128 insertions(+), 52 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c
>>> index a1e652b7631d..19e9773b41be 100644
>>> --- a/drivers/gpu/drm/drm_buddy.c
>>> +++ b/drivers/gpu/drm/drm_buddy.c
>>> @@ -31,6 +31,8 @@ static struct drm_buddy_block
>>> *drm_block_alloc(struct drm_buddy *mm,
>>> block->header |= order;
>>> block->parent = parent;
>>> + RB_CLEAR_NODE(&block->rb);
>>> +
>>> BUG_ON(block->header & DRM_BUDDY_HEADER_UNUSED);
>>> return block;
>>> }
>>> @@ -41,23 +43,53 @@ static void drm_block_free(struct drm_buddy *mm,
>>> kmem_cache_free(slab_blocks, block);
>>> }
>>> -static void list_insert_sorted(struct drm_buddy *mm,
>>> - struct drm_buddy_block *block)
>>> +static void rbtree_insert(struct drm_buddy *mm,
>>> + struct drm_buddy_block *block)
>>> {
>>> + struct rb_root *root = &mm-
>>> >free_tree[drm_buddy_block_order(block)];
>>> + struct rb_node **link = &root->rb_node;
>>> + struct rb_node *parent = NULL;
>>> struct drm_buddy_block *node;
>>> - struct list_head *head;
>>> + u64 offset;
>>> +
>>> + offset = drm_buddy_block_offset(block);
>>> - head = &mm->free_list[drm_buddy_block_order(block)];
>>> - if (list_empty(head)) {
>>> - list_add(&block->link, head);
>>> - return;
>>> + while (*link) {
>>> + parent = *link;
>>> + node = rb_entry(parent, struct drm_buddy_block, rb);
>>> +
>>> + if (offset < drm_buddy_block_offset(node))
>>> + link = &parent->rb_left;
>>> + else
>>> + link = &parent->rb_right;
>>> }
>>> - list_for_each_entry(node, head, link)
>>> - if (drm_buddy_block_offset(block) <
>>> drm_buddy_block_offset(node))
>>> - break;
>>> + rb_link_node(&block->rb, parent, link);
>>> + rb_insert_color(&block->rb, root);
>>> +}
>>> +
>>> +static void rbtree_remove(struct drm_buddy *mm,
>>> + struct drm_buddy_block *block)
>>> +{
>>> + struct rb_root *root;
>>> - __list_add(&block->link, node->link.prev, &node->link);
>>> + root = &mm->free_tree[drm_buddy_block_order(block)];
>>> + rb_erase(&block->rb, root);
>>> +
>>> + RB_CLEAR_NODE(&block->rb);
>>> +}
>>> +
>>> +static inline struct drm_buddy_block *
>>> +rbtree_last_entry(struct drm_buddy *mm, unsigned int order)
>>> +{
>>> + struct rb_node *node = rb_last(&mm->free_tree[order]);
>>> +
>>> + return node ? rb_entry(node, struct drm_buddy_block, rb) : NULL;
>>> +}
>>> +
>>> +static bool rbtree_is_empty(struct drm_buddy *mm, unsigned int order)
>>> +{
>>> + return RB_EMPTY_ROOT(&mm->free_tree[order]);
>>> }
>>> static void clear_reset(struct drm_buddy_block *block)
>>> @@ -70,12 +102,13 @@ static void mark_cleared(struct drm_buddy_block
>>> *block)
>>> block->header |= DRM_BUDDY_HEADER_CLEAR;
>>> }
>>> -static void mark_allocated(struct drm_buddy_block *block)
>>> +static void mark_allocated(struct drm_buddy *mm,
>>> + struct drm_buddy_block *block)
>>> {
>>> block->header &= ~DRM_BUDDY_HEADER_STATE;
>>> block->header |= DRM_BUDDY_ALLOCATED;
>>> - list_del(&block->link);
>>> + rbtree_remove(mm, block);
>>> }
>>> static void mark_free(struct drm_buddy *mm,
>>> @@ -84,15 +117,16 @@ static void mark_free(struct drm_buddy *mm,
>>> block->header &= ~DRM_BUDDY_HEADER_STATE;
>>> block->header |= DRM_BUDDY_FREE;
>>> - list_insert_sorted(mm, block);
>>> + rbtree_insert(mm, block);
>>> }
>>> -static void mark_split(struct drm_buddy_block *block)
>>> +static void mark_split(struct drm_buddy *mm,
>>> + struct drm_buddy_block *block)
>>> {
>>> block->header &= ~DRM_BUDDY_HEADER_STATE;
>>> block->header |= DRM_BUDDY_SPLIT;
>>> - list_del(&block->link);
>>> + rbtree_remove(mm, block);
>>> }
>>> static inline bool overlaps(u64 s1, u64 e1, u64 s2, u64 e2)
>>> @@ -148,7 +182,7 @@ static unsigned int __drm_buddy_free(struct
>>> drm_buddy *mm,
>>> mark_cleared(parent);
>>> }
>>> - list_del(&buddy->link);
>>> + rbtree_remove(mm, buddy);
>>> if (force_merge && drm_buddy_block_is_clear(buddy))
>>> mm->clear_avail -= drm_buddy_block_size(mm, buddy);
>>> @@ -179,12 +213,17 @@ static int __force_merge(struct drm_buddy *mm,
>>> return -EINVAL;
>>> for (i = min_order - 1; i >= 0; i--) {
>>> - struct drm_buddy_block *block, *prev;
>>> + struct drm_buddy_block *block, *prev_block, *first_block;
>>> +
>>> + first_block = rb_entry(rb_first(&mm->free_tree[i]), struct
>>> drm_buddy_block, rb);
>>> - list_for_each_entry_safe_reverse(block, prev, &mm-
>>> >free_list[i], link) {
>>> + for_each_rb_entry_reverse_safe(block, prev_block, &mm-
>>> >free_tree[i], rb) {
>>> struct drm_buddy_block *buddy;
>>> u64 block_start, block_end;
>>> + if (RB_EMPTY_NODE(&block->rb))
>>> + break;
>>
>> If we got the block from the rb tree, can it be empty here?
>
> I saw a crash earlier without this check while running graphics
> workloads, but it is not happening now. I will
>
> test with more workloads and remove it if everything looks fine.
>
>>
>>> +
>>> if (!block->parent)
>>> continue;
>>> @@ -206,10 +245,14 @@ static int __force_merge(struct drm_buddy *mm,
>>> * block in the next iteration as we would free the
>>> * buddy block as part of the free function.
>>> */
>>> - if (prev == buddy)
>>> - prev = list_prev_entry(prev, link);
>>> + if (prev_block && prev_block == buddy) {
>>> + if (prev_block != first_block)
>>> + prev_block = rb_entry(rb_prev(&prev_block->rb),
>>> + struct drm_buddy_block,
>>> + rb);
>>> + }
>>> - list_del(&block->link);
>>> + rbtree_remove(mm, block);
>>> if (drm_buddy_block_is_clear(block))
>>> mm->clear_avail -= drm_buddy_block_size(mm, block);
>>> @@ -258,14 +301,14 @@ int drm_buddy_init(struct drm_buddy *mm, u64
>>> size, u64 chunk_size)
>>> BUG_ON(mm->max_order > DRM_BUDDY_MAX_ORDER);
>>> - mm->free_list = kmalloc_array(mm->max_order + 1,
>>> - sizeof(struct list_head),
>>> + mm->free_tree = kmalloc_array(mm->max_order + 1,
>>> + sizeof(struct rb_root),
>>> GFP_KERNEL);
>>> - if (!mm->free_list)
>>> + if (!mm->free_tree)
>>> return -ENOMEM;
>>> for (i = 0; i <= mm->max_order; ++i)
>>> - INIT_LIST_HEAD(&mm->free_list[i]);
>>> + mm->free_tree[i] = RB_ROOT;
>>> mm->n_roots = hweight64(size);
>>> @@ -273,7 +316,7 @@ int drm_buddy_init(struct drm_buddy *mm, u64
>>> size, u64 chunk_size)
>>> sizeof(struct drm_buddy_block *),
>>> GFP_KERNEL);
>>> if (!mm->roots)
>>> - goto out_free_list;
>>> + goto out_free_tree;
>>> offset = 0;
>>> i = 0;
>>> @@ -312,8 +355,8 @@ int drm_buddy_init(struct drm_buddy *mm, u64
>>> size, u64 chunk_size)
>>> while (i--)
>>> drm_block_free(mm, mm->roots[i]);
>>> kfree(mm->roots);
>>> -out_free_list:
>>> - kfree(mm->free_list);
>>> +out_free_tree:
>>> + kfree(mm->free_tree);
>>> return -ENOMEM;
>>> }
>>> EXPORT_SYMBOL(drm_buddy_init);
>>> @@ -323,7 +366,7 @@ EXPORT_SYMBOL(drm_buddy_init);
>>> *
>>> * @mm: DRM buddy manager to free
>>> *
>>> - * Cleanup memory manager resources and the freelist
>>> + * Cleanup memory manager resources and the freetree
>>> */
>>> void drm_buddy_fini(struct drm_buddy *mm)
>>> {
>>> @@ -350,7 +393,7 @@ void drm_buddy_fini(struct drm_buddy *mm)
>>> WARN_ON(mm->avail != mm->size);
>>> kfree(mm->roots);
>>> - kfree(mm->free_list);
>>> + kfree(mm->free_tree);
>>> }
>>> EXPORT_SYMBOL(drm_buddy_fini);
>>> @@ -383,7 +426,7 @@ static int split_block(struct drm_buddy *mm,
>>> clear_reset(block);
>>> }
>>> - mark_split(block);
>>> + mark_split(mm, block);
>>> return 0;
>>> }
>>> @@ -598,7 +641,7 @@ get_maxblock(struct drm_buddy *mm, unsigned int
>>> order,
>>> for (i = order; i <= mm->max_order; ++i) {
>>> struct drm_buddy_block *tmp_block;
>>> - list_for_each_entry_reverse(tmp_block, &mm->free_list[i],
>>> link) {
>>> + for_each_rb_entry_reverse(tmp_block, &mm->free_tree[i], rb) {
>>> if (block_incompatible(tmp_block, flags))
>>> continue;
>>> @@ -624,7 +667,7 @@ get_maxblock(struct drm_buddy *mm, unsigned int
>>> order,
>>> }
>>> static struct drm_buddy_block *
>>> -alloc_from_freelist(struct drm_buddy *mm,
>>> +alloc_from_freetree(struct drm_buddy *mm,
>>> unsigned int order,
>>> unsigned long flags)
>>> {
>>> @@ -641,7 +684,7 @@ alloc_from_freelist(struct drm_buddy *mm,
>>> for (tmp = order; tmp <= mm->max_order; ++tmp) {
>>> struct drm_buddy_block *tmp_block;
>>> - list_for_each_entry_reverse(tmp_block, &mm-
>>> >free_list[tmp], link) {
>>> + for_each_rb_entry_reverse(tmp_block, &mm-
>>> >free_tree[tmp], rb) {
>>> if (block_incompatible(tmp_block, flags))
>>> continue;
>>> @@ -657,10 +700,8 @@ alloc_from_freelist(struct drm_buddy *mm,
>>> if (!block) {
>>> /* Fallback method */
>>> for (tmp = order; tmp <= mm->max_order; ++tmp) {
>>> - if (!list_empty(&mm->free_list[tmp])) {
>>> - block = list_last_entry(&mm->free_list[tmp],
>>> - struct drm_buddy_block,
>>> - link);
>>> + if (!rbtree_is_empty(mm, tmp)) {
>>> + block = rbtree_last_entry(mm, tmp);
>>> if (block)
>>> break;
>>> }
>>> @@ -728,7 +769,7 @@ static int __alloc_range(struct drm_buddy *mm,
>>> if (contains(start, end, block_start, block_end)) {
>>> if (drm_buddy_block_is_free(block)) {
>>> - mark_allocated(block);
>>> + mark_allocated(mm, block);
>>> total_allocated += drm_buddy_block_size(mm, block);
>>> mm->avail -= drm_buddy_block_size(mm, block);
>>> if (drm_buddy_block_is_clear(block))
>>> @@ -806,7 +847,6 @@ static int __alloc_contig_try_harder(struct
>>> drm_buddy *mm,
>>> {
>>> u64 rhs_offset, lhs_offset, lhs_size, filled;
>>> struct drm_buddy_block *block;
>>> - struct list_head *list;
>>> LIST_HEAD(blocks_lhs);
>>> unsigned long pages;
>>> unsigned int order;
>>> @@ -819,11 +859,10 @@ static int __alloc_contig_try_harder(struct
>>> drm_buddy *mm,
>>> if (order == 0)
>>> return -ENOSPC;
>>> - list = &mm->free_list[order];
>>> - if (list_empty(list))
>>> + if (rbtree_is_empty(mm, order))
>>> return -ENOSPC;
>>> - list_for_each_entry_reverse(block, list, link) {
>>> + for_each_rb_entry_reverse(block, &mm->free_tree[order], rb) {
>>> /* Allocate blocks traversing RHS */
>>> rhs_offset = drm_buddy_block_offset(block);
>>> err = __drm_buddy_alloc_range(mm, rhs_offset, size,
>>> @@ -933,7 +972,7 @@ int drm_buddy_block_trim(struct drm_buddy *mm,
>>> list_add(&block->tmp_link, &dfs);
>>> err = __alloc_range(mm, &dfs, new_start, new_size, blocks, NULL);
>>> if (err) {
>>> - mark_allocated(block);
>>> + mark_allocated(mm, block);
>>> mm->avail -= drm_buddy_block_size(mm, block);
>>> if (drm_buddy_block_is_clear(block))
>>> mm->clear_avail -= drm_buddy_block_size(mm, block);
>>> @@ -956,8 +995,8 @@ __drm_buddy_alloc_blocks(struct drm_buddy *mm,
>>> return __drm_buddy_alloc_range_bias(mm, start, end,
>>> order, flags);
>>> else
>>> - /* Allocate from freelist */
>>> - return alloc_from_freelist(mm, order, flags);
>>> + /* Allocate from freetree */
>>> + return alloc_from_freetree(mm, order, flags);
>>> }
>>> /**
>>> @@ -974,8 +1013,8 @@ __drm_buddy_alloc_blocks(struct drm_buddy *mm,
>>> * alloc_range_bias() called on range limitations, which traverses
>>> * the tree and returns the desired block.
>>> *
>>> - * alloc_from_freelist() called when *no* range restrictions
>>> - * are enforced, which picks the block from the freelist.
>>> + * alloc_from_freetree() called when *no* range restrictions
>>> + * are enforced, which picks the block from the freetree.
>>> *
>>> * Returns:
>>> * 0 on success, error code on failure.
>>> @@ -1077,7 +1116,7 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm,
>>> }
>>> } while (1);
>>> - mark_allocated(block);
>>> + mark_allocated(mm, block);
>>> mm->avail -= drm_buddy_block_size(mm, block);
>>> if (drm_buddy_block_is_clear(block))
>>> mm->clear_avail -= drm_buddy_block_size(mm, block);
>>> @@ -1161,7 +1200,7 @@ void drm_buddy_print(struct drm_buddy *mm,
>>> struct drm_printer *p)
>>> struct drm_buddy_block *block;
>>> u64 count = 0, free;
>>> - list_for_each_entry(block, &mm->free_list[order], link) {
>>> + for_each_rb_entry(block, &mm->free_tree[order], rb) {
>>> BUG_ON(!drm_buddy_block_is_free(block));
>>> count++;
>>> }
>>> diff --git a/include/drm/drm_buddy.h b/include/drm/drm_buddy.h
>>> index 9689a7c5dd36..a64d108a33b7 100644
>>> --- a/include/drm/drm_buddy.h
>>> +++ b/include/drm/drm_buddy.h
>>> @@ -10,6 +10,7 @@
>>> #include <linux/list.h>
>>> #include <linux/slab.h>
>>> #include <linux/sched.h>
>>> +#include <linux/rbtree.h>
>>> #include <drm/drm_print.h>
>>> @@ -22,6 +23,41 @@
>>> start__ >= max__ || size__ > max__ - start__; \
>>> })
>>> +/*
>>> + * for_each_rb_entry() - iterate over an RB tree in order
>>> + * @pos: the struct type * to use as a loop cursor
>>> + * @root: pointer to struct rb_root to iterate
>>> + * @member: name of the rb_node field within the struct
>>> + */
>>> +#define for_each_rb_entry(pos, root, member) \
>>> + for (pos = rb_entry_safe(rb_first(root), typeof(*pos), member); \
>>> + pos; \
>>> + pos = rb_entry_safe(rb_next(&(pos)->member), typeof(*pos),
>>> member))
>>> +
>>> +/*
>>> + * for_each_rb_entry_reverse() - iterate over an RB tree in reverse
>>> order
>>> + * @pos: the struct type * to use as a loop cursor
>>> + * @root: pointer to struct rb_root to iterate
>>> + * @member: name of the rb_node field within the struct
>>> + */
>>> +#define for_each_rb_entry_reverse(pos, root, member) \
>>> + for (pos = rb_entry_safe(rb_last(root), typeof(*pos), member); \
>>> + pos; \
>>> + pos = rb_entry_safe(rb_prev(&(pos)->member), typeof(*pos),
>>> member))
>>> +
>>> +/**
>>> + * for_each_rb_entry_reverse_safe() - safely iterate over an RB tree
>>> in reverse order
>>> + * @pos: the struct type * to use as a loop cursor.
>>> + * @n: another struct type * to use as temporary storage.
>>> + * @root: pointer to struct rb_root to iterate.
>>> + * @member: name of the rb_node field within the struct.
>>> + */
>>> +#define for_each_rb_entry_reverse_safe(pos, n, root, member) \
>>
>> Would it make sense to give these a less generic name? Something like
>> for_each_rb_free_block_* ?
>>
>> Also should this be exported or rather kept within .c?
> yes, I will change it.
>>
>>> + for (pos = rb_entry_safe(rb_last(root), typeof(*pos), member), \
>>> + n = pos ? rb_entry_safe(rb_prev(&(pos)->member),
>>> typeof(*pos), member) : NULL; \
>>> + pos; \
>>> + pos = n, n = pos ? rb_entry_safe(rb_prev(&(pos)->member),
>>> typeof(*pos), member) : NULL)
>>
>>
>>> +
>>> #define DRM_BUDDY_RANGE_ALLOCATION BIT(0)
>>> #define DRM_BUDDY_TOPDOWN_ALLOCATION BIT(1)
>>> #define DRM_BUDDY_CONTIGUOUS_ALLOCATION BIT(2)
>>> @@ -53,6 +89,7 @@ struct drm_buddy_block {
>>> * a list, if so desired. As soon as the block is freed with
>>> * drm_buddy_free* ownership is given back to the mm.
>>> */
>>> + struct rb_node rb;
>>> struct list_head link;
>>
>> I think we can be slippery here and make this a union? They should be
>> mutually exclusive AFAICT?
>
> AFAIK, we need both, rb_node rb is for handling free blocks and the
> list_head link for adding the allocated blocks into
>
> the driver's list.
At a given time I think it is either on the free rb or user block list,
not both at the same time, given that a block can't be free and
allocated? Just thinking if there is an easy way to trim the size a bit,
given that we are adding an entire rb_node, and there can be many of
these of these blocks around.
>
>>
>>> struct list_head tmp_link;
>>
>> Otherwise it should be possible to get rid of this instead, and just
>> re-use link? Could be done as separate patch, if this makes sense.
>
> I think we cannot use link here since it is needed to add the allocated
> blocks to the driver's list and tmp_link is already used in
>
> alloc_range() and alloc_range_bias().
Yeah, I don't this will work, but union might still be an option instead.
>
> Regards,
>
> Arun.
>
>>
>>> };
>>> @@ -68,7 +105,7 @@ struct drm_buddy_block {
>>> */
>>> struct drm_buddy {
>>> /* Maintain a free list for each order. */
>>> - struct list_head *free_list;
>>> + struct rb_root *free_tree;
>>> /*
>>> * Maintain explicit binary tree(s) to track the allocation of the
>>
More information about the amd-gfx
mailing list