[PATCH v2 1/2] drm/buddy: Optimize free block management with RB tree
Arunpravin Paneer Selvam
arunpravin.paneerselvam at amd.com
Wed Aug 20 10:37:16 UTC 2025
On 8/20/2025 3:44 PM, Matthew Auld wrote:
> On 20/08/2025 08:56, Arunpravin Paneer Selvam wrote:
>> Hi Matthew,
>>
>> On 8/14/2025 4:15 PM, Matthew Auld wrote:
>>> On 24/07/2025 11:46, Arunpravin Paneer Selvam wrote:
>>>> Replace the freelist (O(n)) used for free block management with a
>>>> red-black tree, providing more efficient O(log n) search, insert,
>>>> and delete operations. This improves scalability and performance
>>>> when managing large numbers of free blocks per order (e.g., hundreds
>>>> or thousands).
>>>>
>>>> In the VK-CTS memory stress subtest, the buddy manager merges
>>>> fragmented memory and inserts freed blocks into the freelist. Since
>>>> freelist insertion is O(n), this becomes a bottleneck as fragmentation
>>>> increases. Benchmarking shows list_insert_sorted() consumes ~52.69% CPU
>>>> with the freelist, compared to just 0.03% with the RB tree
>>>> (rbtree_insert.isra.0), despite performing the same sorted insert.
>>>>
>>>> This also improves performance in heavily fragmented workloads,
>>>> such as games or graphics tests that stress memory.
>>>
>>> Neat. Also please Cc intel-gfx at lists.freedesktop.org and
>>> intel-xe at lists.freedesktop.org on the next revision so our CI can
>>> pick this up.
Sure, I will add them on v3.
>>>
>>>>
>>>> Signed-off-by: Arunpravin Paneer Selvam
>>>> <Arunpravin.PaneerSelvam at amd.com>
>>>> ---
>>>> drivers/gpu/drm/drm_buddy.c | 141 +++++++++++++++++++++++-------------
>>>> include/drm/drm_buddy.h | 39 +++++++++-
>>>> 2 files changed, 128 insertions(+), 52 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c
>>>> index a1e652b7631d..19e9773b41be 100644
>>>> --- a/drivers/gpu/drm/drm_buddy.c
>>>> +++ b/drivers/gpu/drm/drm_buddy.c
>>>> @@ -31,6 +31,8 @@ static struct drm_buddy_block *drm_block_alloc(struct drm_buddy *mm,
>>>> block->header |= order;
>>>> block->parent = parent;
>>>> + RB_CLEAR_NODE(&block->rb);
>>>> +
>>>> BUG_ON(block->header & DRM_BUDDY_HEADER_UNUSED);
>>>> return block;
>>>> }
>>>> @@ -41,23 +43,53 @@ static void drm_block_free(struct drm_buddy *mm,
>>>> kmem_cache_free(slab_blocks, block);
>>>> }
>>>> -static void list_insert_sorted(struct drm_buddy *mm,
>>>> - struct drm_buddy_block *block)
>>>> +static void rbtree_insert(struct drm_buddy *mm,
>>>> + struct drm_buddy_block *block)
>>>> {
>>>> + struct rb_root *root = &mm->free_tree[drm_buddy_block_order(block)];
>>>> + struct rb_node **link = &root->rb_node;
>>>> + struct rb_node *parent = NULL;
>>>> struct drm_buddy_block *node;
>>>> - struct list_head *head;
>>>> + u64 offset;
>>>> +
>>>> + offset = drm_buddy_block_offset(block);
>>>> - head = &mm->free_list[drm_buddy_block_order(block)];
>>>> - if (list_empty(head)) {
>>>> - list_add(&block->link, head);
>>>> - return;
>>>> + while (*link) {
>>>> + parent = *link;
>>>> + node = rb_entry(parent, struct drm_buddy_block, rb);
>>>> +
>>>> + if (offset < drm_buddy_block_offset(node))
>>>> + link = &parent->rb_left;
>>>> + else
>>>> + link = &parent->rb_right;
>>>> }
>>>> - list_for_each_entry(node, head, link)
>>>> - if (drm_buddy_block_offset(block) < drm_buddy_block_offset(node))
>>>> - break;
>>>> + rb_link_node(&block->rb, parent, link);
>>>> + rb_insert_color(&block->rb, root);
>>>> +}
>>>> +
>>>> +static void rbtree_remove(struct drm_buddy *mm,
>>>> + struct drm_buddy_block *block)
>>>> +{
>>>> + struct rb_root *root;
>>>> - __list_add(&block->link, node->link.prev, &node->link);
>>>> + root = &mm->free_tree[drm_buddy_block_order(block)];
>>>> + rb_erase(&block->rb, root);
>>>> +
>>>> + RB_CLEAR_NODE(&block->rb);
>>>> +}
>>>> +
>>>> +static inline struct drm_buddy_block *
>>>> +rbtree_last_entry(struct drm_buddy *mm, unsigned int order)
>>>> +{
>>>> + struct rb_node *node = rb_last(&mm->free_tree[order]);
>>>> +
>>>> + return node ? rb_entry(node, struct drm_buddy_block, rb) : NULL;
>>>> +}
>>>> +
>>>> +static bool rbtree_is_empty(struct drm_buddy *mm, unsigned int order)
>>>> +{
>>>> + return RB_EMPTY_ROOT(&mm->free_tree[order]);
>>>> }
>>>> static void clear_reset(struct drm_buddy_block *block)
>>>> @@ -70,12 +102,13 @@ static void mark_cleared(struct drm_buddy_block *block)
>>>> block->header |= DRM_BUDDY_HEADER_CLEAR;
>>>> }
>>>> -static void mark_allocated(struct drm_buddy_block *block)
>>>> +static void mark_allocated(struct drm_buddy *mm,
>>>> + struct drm_buddy_block *block)
>>>> {
>>>> block->header &= ~DRM_BUDDY_HEADER_STATE;
>>>> block->header |= DRM_BUDDY_ALLOCATED;
>>>> - list_del(&block->link);
>>>> + rbtree_remove(mm, block);
>>>> }
>>>> static void mark_free(struct drm_buddy *mm,
>>>> @@ -84,15 +117,16 @@ static void mark_free(struct drm_buddy *mm,
>>>> block->header &= ~DRM_BUDDY_HEADER_STATE;
>>>> block->header |= DRM_BUDDY_FREE;
>>>> - list_insert_sorted(mm, block);
>>>> + rbtree_insert(mm, block);
>>>> }
>>>> -static void mark_split(struct drm_buddy_block *block)
>>>> +static void mark_split(struct drm_buddy *mm,
>>>> + struct drm_buddy_block *block)
>>>> {
>>>> block->header &= ~DRM_BUDDY_HEADER_STATE;
>>>> block->header |= DRM_BUDDY_SPLIT;
>>>> - list_del(&block->link);
>>>> + rbtree_remove(mm, block);
>>>> }
>>>> static inline bool overlaps(u64 s1, u64 e1, u64 s2, u64 e2)
>>>> @@ -148,7 +182,7 @@ static unsigned int __drm_buddy_free(struct drm_buddy *mm,
>>>> mark_cleared(parent);
>>>> }
>>>> - list_del(&buddy->link);
>>>> + rbtree_remove(mm, buddy);
>>>> if (force_merge && drm_buddy_block_is_clear(buddy))
>>>> mm->clear_avail -= drm_buddy_block_size(mm, buddy);
>>>> @@ -179,12 +213,17 @@ static int __force_merge(struct drm_buddy *mm,
>>>> return -EINVAL;
>>>> for (i = min_order - 1; i >= 0; i--) {
>>>> - struct drm_buddy_block *block, *prev;
>>>> + struct drm_buddy_block *block, *prev_block, *first_block;
>>>> +
>>>> + first_block = rb_entry(rb_first(&mm->free_tree[i]), struct drm_buddy_block, rb);
>>>> - list_for_each_entry_safe_reverse(block, prev, &mm->free_list[i], link) {
>>>> + for_each_rb_entry_reverse_safe(block, prev_block, &mm->free_tree[i], rb) {
>>>> struct drm_buddy_block *buddy;
>>>> u64 block_start, block_end;
>>>> + if (RB_EMPTY_NODE(&block->rb))
>>>> + break;
>>>
>>> If we got the block from the rb tree, can it be empty here?
>>
>> I saw a crash earlier without this check while running graphics
>> workloads, but it is not happening now. I will test with more
>> workloads and remove it if everything looks fine.
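
For context, the only reason this check can work at all is that
rbtree_remove() does RB_CLEAR_NODE() after rb_erase(), so "no longer in
any tree" stays detectable through a stale cursor. A minimal sketch of
the invariant (illustrative only, not the exact patch code):

	/* removal marks the node empty... */
	rb_erase(&block->rb, root);
	RB_CLEAR_NODE(&block->rb);

	/* ...so a cursor cached before the erase can notice that its
	 * block was removed behind its back before dereferencing
	 * neighbours
	 */
	if (RB_EMPTY_NODE(&block->rb))
		break;

If the crash shows up again, it might point at the safe iterator's
lookahead being erased by a recursive merge rather than at the tree
itself.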
>>
>>>
>>>> +
>>>> if (!block->parent)
>>>> continue;
>>>> @@ -206,10 +245,14 @@ static int __force_merge(struct drm_buddy *mm,
>>>> * block in the next iteration as we would free the
>>>> * buddy block as part of the free function.
>>>> */
>>>> - if (prev == buddy)
>>>> - prev = list_prev_entry(prev, link);
>>>> + if (prev_block && prev_block == buddy) {
>>>> + if (prev_block != first_block)
>>>> + prev_block = rb_entry(rb_prev(&prev_block->rb),
>>>> + struct drm_buddy_block,
>>>> + rb);
>>>> + }
>>>> - list_del(&block->link);
>>>> + rbtree_remove(mm, block);
>>>> if (drm_buddy_block_is_clear(block))
>>>> mm->clear_avail -= drm_buddy_block_size(mm, block);
>>>> @@ -258,14 +301,14 @@ int drm_buddy_init(struct drm_buddy *mm, u64 size, u64 chunk_size)
>>>> BUG_ON(mm->max_order > DRM_BUDDY_MAX_ORDER);
>>>> - mm->free_list = kmalloc_array(mm->max_order + 1,
>>>> - sizeof(struct list_head),
>>>> + mm->free_tree = kmalloc_array(mm->max_order + 1,
>>>> + sizeof(struct rb_root),
>>>> GFP_KERNEL);
>>>> - if (!mm->free_list)
>>>> + if (!mm->free_tree)
>>>> return -ENOMEM;
>>>> for (i = 0; i <= mm->max_order; ++i)
>>>> - INIT_LIST_HEAD(&mm->free_list[i]);
>>>> + mm->free_tree[i] = RB_ROOT;
>>>> mm->n_roots = hweight64(size);
>>>> @@ -273,7 +316,7 @@ int drm_buddy_init(struct drm_buddy *mm, u64 size, u64 chunk_size)
>>>> sizeof(struct drm_buddy_block *),
>>>> GFP_KERNEL);
>>>> if (!mm->roots)
>>>> - goto out_free_list;
>>>> + goto out_free_tree;
>>>> offset = 0;
>>>> i = 0;
>>>> @@ -312,8 +355,8 @@ int drm_buddy_init(struct drm_buddy *mm, u64 size, u64 chunk_size)
>>>> while (i--)
>>>> drm_block_free(mm, mm->roots[i]);
>>>> kfree(mm->roots);
>>>> -out_free_list:
>>>> - kfree(mm->free_list);
>>>> +out_free_tree:
>>>> + kfree(mm->free_tree);
>>>> return -ENOMEM;
>>>> }
>>>> EXPORT_SYMBOL(drm_buddy_init);
>>>> @@ -323,7 +366,7 @@ EXPORT_SYMBOL(drm_buddy_init);
>>>> *
>>>> * @mm: DRM buddy manager to free
>>>> *
>>>> - * Cleanup memory manager resources and the freelist
>>>> + * Cleanup memory manager resources and the freetree
>>>> */
>>>> void drm_buddy_fini(struct drm_buddy *mm)
>>>> {
>>>> @@ -350,7 +393,7 @@ void drm_buddy_fini(struct drm_buddy *mm)
>>>> WARN_ON(mm->avail != mm->size);
>>>> kfree(mm->roots);
>>>> - kfree(mm->free_list);
>>>> + kfree(mm->free_tree);
>>>> }
>>>> EXPORT_SYMBOL(drm_buddy_fini);
>>>> @@ -383,7 +426,7 @@ static int split_block(struct drm_buddy *mm,
>>>> clear_reset(block);
>>>> }
>>>> - mark_split(block);
>>>> + mark_split(mm, block);
>>>> return 0;
>>>> }
>>>> @@ -598,7 +641,7 @@ get_maxblock(struct drm_buddy *mm, unsigned int order,
>>>> for (i = order; i <= mm->max_order; ++i) {
>>>> struct drm_buddy_block *tmp_block;
>>>> - list_for_each_entry_reverse(tmp_block, &mm->free_list[i], link) {
>>>> + for_each_rb_entry_reverse(tmp_block, &mm->free_tree[i], rb) {
>>>> if (block_incompatible(tmp_block, flags))
>>>> continue;
>>>> @@ -624,7 +667,7 @@ get_maxblock(struct drm_buddy *mm, unsigned int order,
>>>> }
>>>> static struct drm_buddy_block *
>>>> -alloc_from_freelist(struct drm_buddy *mm,
>>>> +alloc_from_freetree(struct drm_buddy *mm,
>>>> unsigned int order,
>>>> unsigned long flags)
>>>> {
>>>> @@ -641,7 +684,7 @@ alloc_from_freelist(struct drm_buddy *mm,
>>>> for (tmp = order; tmp <= mm->max_order; ++tmp) {
>>>> struct drm_buddy_block *tmp_block;
>>>> - list_for_each_entry_reverse(tmp_block, &mm->free_list[tmp], link) {
>>>> + for_each_rb_entry_reverse(tmp_block, &mm->free_tree[tmp], rb) {
>>>> if (block_incompatible(tmp_block, flags))
>>>> continue;
>>>> @@ -657,10 +700,8 @@ alloc_from_freelist(struct drm_buddy *mm,
>>>> if (!block) {
>>>> /* Fallback method */
>>>> for (tmp = order; tmp <= mm->max_order; ++tmp) {
>>>> - if (!list_empty(&mm->free_list[tmp])) {
>>>> - block = list_last_entry(&mm->free_list[tmp],
>>>> - struct drm_buddy_block,
>>>> - link);
>>>> + if (!rbtree_is_empty(mm, tmp)) {
>>>> + block = rbtree_last_entry(mm, tmp);
>>>> if (block)
>>>> break;
>>>> }
>>>> @@ -728,7 +769,7 @@ static int __alloc_range(struct drm_buddy *mm,
>>>> if (contains(start, end, block_start, block_end)) {
>>>> if (drm_buddy_block_is_free(block)) {
>>>> - mark_allocated(block);
>>>> + mark_allocated(mm, block);
>>>> total_allocated += drm_buddy_block_size(mm, block);
>>>> mm->avail -= drm_buddy_block_size(mm, block);
>>>> if (drm_buddy_block_is_clear(block))
>>>> @@ -806,7 +847,6 @@ static int __alloc_contig_try_harder(struct drm_buddy *mm,
>>>> {
>>>> u64 rhs_offset, lhs_offset, lhs_size, filled;
>>>> struct drm_buddy_block *block;
>>>> - struct list_head *list;
>>>> LIST_HEAD(blocks_lhs);
>>>> unsigned long pages;
>>>> unsigned int order;
>>>> @@ -819,11 +859,10 @@ static int __alloc_contig_try_harder(struct drm_buddy *mm,
>>>> if (order == 0)
>>>> return -ENOSPC;
>>>> - list = &mm->free_list[order];
>>>> - if (list_empty(list))
>>>> + if (rbtree_is_empty(mm, order))
>>>> return -ENOSPC;
>>>> - list_for_each_entry_reverse(block, list, link) {
>>>> + for_each_rb_entry_reverse(block, &mm->free_tree[order], rb) {
>>>> /* Allocate blocks traversing RHS */
>>>> rhs_offset = drm_buddy_block_offset(block);
>>>> err = __drm_buddy_alloc_range(mm, rhs_offset, size,
>>>> @@ -933,7 +972,7 @@ int drm_buddy_block_trim(struct drm_buddy *mm,
>>>> list_add(&block->tmp_link, &dfs);
>>>> err = __alloc_range(mm, &dfs, new_start, new_size, blocks, NULL);
>>>> if (err) {
>>>> - mark_allocated(block);
>>>> + mark_allocated(mm, block);
>>>> mm->avail -= drm_buddy_block_size(mm, block);
>>>> if (drm_buddy_block_is_clear(block))
>>>> mm->clear_avail -= drm_buddy_block_size(mm, block);
>>>> @@ -956,8 +995,8 @@ __drm_buddy_alloc_blocks(struct drm_buddy *mm,
>>>> return __drm_buddy_alloc_range_bias(mm, start, end,
>>>> order, flags);
>>>> else
>>>> - /* Allocate from freelist */
>>>> - return alloc_from_freelist(mm, order, flags);
>>>> + /* Allocate from freetree */
>>>> + return alloc_from_freetree(mm, order, flags);
>>>> }
>>>> /**
>>>> @@ -974,8 +1013,8 @@ __drm_buddy_alloc_blocks(struct drm_buddy *mm,
>>>> * alloc_range_bias() called on range limitations, which traverses
>>>> * the tree and returns the desired block.
>>>> *
>>>> - * alloc_from_freelist() called when *no* range restrictions
>>>> - * are enforced, which picks the block from the freelist.
>>>> + * alloc_from_freetree() called when *no* range restrictions
>>>> + * are enforced, which picks the block from the freetree.
>>>> *
>>>> * Returns:
>>>> * 0 on success, error code on failure.
>>>> @@ -1077,7 +1116,7 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm,
>>>> }
>>>> } while (1);
>>>> - mark_allocated(block);
>>>> + mark_allocated(mm, block);
>>>> mm->avail -= drm_buddy_block_size(mm, block);
>>>> if (drm_buddy_block_is_clear(block))
>>>> mm->clear_avail -= drm_buddy_block_size(mm, block);
>>>> @@ -1161,7 +1200,7 @@ void drm_buddy_print(struct drm_buddy *mm, struct drm_printer *p)
>>>> struct drm_buddy_block *block;
>>>> u64 count = 0, free;
>>>> - list_for_each_entry(block, &mm->free_list[order], link) {
>>>> + for_each_rb_entry(block, &mm->free_tree[order], rb) {
>>>> BUG_ON(!drm_buddy_block_is_free(block));
>>>> count++;
>>>> }
>>>> diff --git a/include/drm/drm_buddy.h b/include/drm/drm_buddy.h
>>>> index 9689a7c5dd36..a64d108a33b7 100644
>>>> --- a/include/drm/drm_buddy.h
>>>> +++ b/include/drm/drm_buddy.h
>>>> @@ -10,6 +10,7 @@
>>>> #include <linux/list.h>
>>>> #include <linux/slab.h>
>>>> #include <linux/sched.h>
>>>> +#include <linux/rbtree.h>
>>>> #include <drm/drm_print.h>
>>>> @@ -22,6 +23,41 @@
>>>> start__ >= max__ || size__ > max__ - start__; \
>>>> })
>>>> +/*
>>>> + * for_each_rb_entry() - iterate over an RB tree in order
>>>> + * @pos: the struct type * to use as a loop cursor
>>>> + * @root: pointer to struct rb_root to iterate
>>>> + * @member: name of the rb_node field within the struct
>>>> + */
>>>> +#define for_each_rb_entry(pos, root, member) \
>>>> + for (pos = rb_entry_safe(rb_first(root), typeof(*pos), member); \
>>>> + pos; \
>>>> + pos = rb_entry_safe(rb_next(&(pos)->member), typeof(*pos), member))
>>>> +
>>>> +/*
>>>> + * for_each_rb_entry_reverse() - iterate over an RB tree in reverse order
>>>> + * @pos: the struct type * to use as a loop cursor
>>>> + * @root: pointer to struct rb_root to iterate
>>>> + * @member: name of the rb_node field within the struct
>>>> + */
>>>> +#define for_each_rb_entry_reverse(pos, root, member) \
>>>> + for (pos = rb_entry_safe(rb_last(root), typeof(*pos), member); \
>>>> + pos; \
>>>> + pos = rb_entry_safe(rb_prev(&(pos)->member), typeof(*pos), member))
>>>> +
>>>> +/**
>>>> + * for_each_rb_entry_reverse_safe() - safely iterate over an RB tree in reverse order
>>>> + * @pos: the struct type * to use as a loop cursor.
>>>> + * @n: another struct type * to use as temporary storage.
>>>> + * @root: pointer to struct rb_root to iterate.
>>>> + * @member: name of the rb_node field within the struct.
>>>> + */
>>>> +#define for_each_rb_entry_reverse_safe(pos, n, root, member) \
>>>
>>> Would it make sense to give these a less generic name? Something
>>> like for_each_rb_free_block_* ?
>>>
>>> Also should this be exported or rather kept within .c?
>> yes, I will change it.
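
For v3, something like this, kept private to drm_buddy.c rather than
exported from the header (untested sketch with the less generic name):

	/* drm_buddy.c: walk the free blocks of one order, highest offset first */
	#define for_each_rb_free_block_reverse(block, root, member)		\
		for (block = rb_entry_safe(rb_last(root), typeof(*block), member); \
		     block;							\
		     block = rb_entry_safe(rb_prev(&(block)->member),		\
					   typeof(*block), member))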
>>>
>>>> + for (pos = rb_entry_safe(rb_last(root), typeof(*pos), member), \
>>>> + n = pos ? rb_entry_safe(rb_prev(&(pos)->member), typeof(*pos), member) : NULL; \
>>>> + pos; \
>>>> + pos = n, n = pos ? rb_entry_safe(rb_prev(&(pos)->member), typeof(*pos), member) : NULL)
>>>
>>>
>>>> +
>>>> #define DRM_BUDDY_RANGE_ALLOCATION BIT(0)
>>>> #define DRM_BUDDY_TOPDOWN_ALLOCATION BIT(1)
>>>> #define DRM_BUDDY_CONTIGUOUS_ALLOCATION BIT(2)
>>>> @@ -53,6 +89,7 @@ struct drm_buddy_block {
>>>> * a list, if so desired. As soon as the block is freed with
>>>> * drm_buddy_free* ownership is given back to the mm.
>>>> */
>>>> + struct rb_node rb;
>>>> struct list_head link;
>>>
>>> I think we can be slippery here and make this a union? They should
>>> be mutually exclusive AFAICT?
>>
>> AFAIK, we need both: rb_node rb is for handling free blocks, and
>> list_head link is for adding the allocated blocks to the driver's
>> list.
>
> At a given time I think it is either on the free rb or user block
> list, not both at the same time, given that a block can't be free and
> allocated? Just thinking if there is an easy way to trim the size a
> bit, given that we are adding an entire rb_node, and there can be many
> of these blocks around.
Yes, you are right. I will add a union for the free rb node and the
user block list.
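
Roughly like this (sketch of a possible v3 layout, assuming free and
allocated really are mutually exclusive states):

	struct drm_buddy_block {
		...
		union {
			struct rb_node rb;	/* free: node in mm->free_tree[order] */
			struct list_head link;	/* allocated: entry on the driver's list */
		};
		struct list_head tmp_link;
	};

That should save sizeof(struct list_head) (16 bytes on 64-bit) per
block, which adds up given how many blocks can be live at once.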
>
>>
>>>
>>>> struct list_head tmp_link;
>>>
>>> Otherwise it should be possible to get rid of this instead, and just
>>> re-use link? Could be done as separate patch, if this makes sense.
>>
>> I think we cannot use link here, since it is needed to add the
>> allocated blocks to the driver's list, and tmp_link is already used
>> in alloc_range() and alloc_range_bias().
>
> Yeah, I don't think this will work, but a union might still be an option instead.
Sure, we can make a union for rb and link.
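
For reference, the overlap that rules out folding tmp_link into link is
roughly this (simplified from __alloc_range(); "allocated" is the local
output list there):

	/* the DFS stack inside the allocator runs on tmp_link... */
	list_add(&block->tmp_link, &dfs);
	...
	/* ...while freshly allocated blocks are collected through link */
	mark_allocated(mm, block);
	list_add_tail(&block->link, &allocated);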
Regards,
Arun.
>
>
>>
>> Regards,
>>
>> Arun.
>>
>>>
>>>> };
>>>> @@ -68,7 +105,7 @@ struct drm_buddy_block {
>>>> */
>>>> struct drm_buddy {
>>>> /* Maintain a free list for each order. */
>>>> - struct list_head *free_list;
>>>> + struct rb_root *free_tree;
>>>> /*
>>>> * Maintain explicit binary tree(s) to track the allocation of the
>>>
>