[PATCH] drm/xe: Implement clear VRAM on free

Wed Jun 11 16:46:43 UTC 2025

On Wed, Jun 11, 2025 at 10:26:44AM -0600, Summers, Stuart wrote:
> On Tue, 2025-06-10 at 22:42 -0700, Matthew Brost wrote:
> > Clearing on free should hide latency of BO clears on new user BO
> > allocations.
> 
> Do you have any test results showing the latency improvement here on
> high volume submission tests? Also how does this impact at very large

No performance data yet available. Something we definitely should look
into and would be curious about the results. FWIW, AMDGPU implements
clear on free in a very similar way to this series. 

> buffer size submissions and mix of large and small buffer submissions
> with eviction between different processes?
> 

Eviction somewhat orthogonal to this. If an eviction occurs and it is a
new allocation a clear would still have be issued ahead handing the
memory over to the new BO, if eviction occurs and paged in BO has
backing store (it was previously evicted) we'd just copy the contents
into the allocated memory.

Matt

> > 
> > Implemented via calling xe_migrate_clear in release notify and
> > updating
> > iterator in xe_migrate_clear to skip cleared buddy blocks. Only user
> > BOs
> > cleared in release notify as kernel BOs could still be in use (e.g.,
> > PT
> > BOs need to wait for dma-resv to be idle).
> > 
> > Signed-off-by: Matthew Brost <matthew.brost at intel.com>
> > ---
> >  drivers/gpu/drm/xe/xe_bo.c           | 47
> > ++++++++++++++++++++++++++++
> >  drivers/gpu/drm/xe/xe_migrate.c      | 14 ++++++---
> >  drivers/gpu/drm/xe/xe_migrate.h      |  1 +
> >  drivers/gpu/drm/xe/xe_res_cursor.h   | 26 +++++++++++++++
> >  drivers/gpu/drm/xe/xe_ttm_vram_mgr.c |  5 ++-
> >  drivers/gpu/drm/xe/xe_ttm_vram_mgr.h |  6 ++++
> >  6 files changed, 94 insertions(+), 5 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c
> > index 4e39188a021a..74470f4d418d 100644
> > --- a/drivers/gpu/drm/xe/xe_bo.c
> > +++ b/drivers/gpu/drm/xe/xe_bo.c
> > @@ -1434,6 +1434,51 @@ static bool
> > xe_ttm_bo_lock_in_destructor(struct ttm_buffer_object *ttm_bo)
> >         return locked;
> >  }
> >  
> > +static void xe_ttm_bo_release_clear(struct ttm_buffer_object
> > *ttm_bo)
> > +{
> > +       struct xe_device *xe = ttm_to_xe_device(ttm_bo->bdev);
> > +       struct dma_fence *fence;
> > +       int err, idx;
> > +
> > +       xe_bo_assert_held(ttm_to_xe_bo(ttm_bo));
> > +
> > +       if (ttm_bo->type != ttm_bo_type_device)
> > +               return;
> > +
> > +       if (xe_device_wedged(xe))
> > +               return;
> > +
> > +       if (!ttm_bo->resource || !mem_type_is_vram(ttm_bo->resource-
> > >mem_type))
> > +               return;
> > +
> > +       if (!drm_dev_enter(&xe->drm, &idx))
> > +               return;
> > +
> > +       if (!xe_pm_runtime_get_if_active(xe))
> > +               goto unbind;
> > +
> > +       err = dma_resv_reserve_fences(&ttm_bo->base._resv, 1);
> > +       if (err)
> > +               goto put_pm;
> > +
> > +       fence = xe_migrate_clear(mem_type_to_migrate(xe, ttm_bo-
> > >resource->mem_type),
> > +                                ttm_to_xe_bo(ttm_bo), ttm_bo-
> > >resource,
> > +                                XE_MIGRATE_CLEAR_FLAG_FULL |
> > +                                XE_MIGRATE_CLEAR_NON_DIRTY);
> > +       if (XE_WARN_ON(IS_ERR(fence)))
> > +               goto put_pm;
> > +
> > +       xe_ttm_vram_mgr_resource_set_cleared(ttm_bo->resource);
> > +       dma_resv_add_fence(&ttm_bo->base._resv, fence,
> > +                          DMA_RESV_USAGE_KERNEL);
> > +       dma_fence_put(fence);
> > +
> > +put_pm:
> > +       xe_pm_runtime_put(xe);
> > +unbind:
> > +       drm_dev_exit(idx);
> > +}
> > +
> >  static void xe_ttm_bo_release_notify(struct ttm_buffer_object
> > *ttm_bo)
> >  {
> >         struct dma_resv_iter cursor;
> > @@ -1478,6 +1523,8 @@ static void xe_ttm_bo_release_notify(struct
> > ttm_buffer_object *ttm_bo)
> >         }
> >         dma_fence_put(replacement);
> >  
> > +       xe_ttm_bo_release_clear(ttm_bo);
> > +
> >         dma_resv_unlock(ttm_bo->base.resv);
> >  }
> >  
> > diff --git a/drivers/gpu/drm/xe/xe_migrate.c
> > b/drivers/gpu/drm/xe/xe_migrate.c
> > index 8f8e9fdfb2a8..39d7200cb366 100644
> > --- a/drivers/gpu/drm/xe/xe_migrate.c
> > +++ b/drivers/gpu/drm/xe/xe_migrate.c
> > @@ -1063,7 +1063,7 @@ struct dma_fence *xe_migrate_clear(struct
> > xe_migrate *m,
> >         struct xe_gt *gt = m->tile->primary_gt;
> >         struct xe_device *xe = gt_to_xe(gt);
> >         bool clear_only_system_ccs = false;
> > -       struct dma_fence *fence = NULL;
> > +       struct dma_fence *fence = dma_fence_get_stub();
> >         u64 size = bo->size;
> >         struct xe_res_cursor src_it;
> >         struct ttm_resource *src = dst;
> > @@ -1075,10 +1075,13 @@ struct dma_fence *xe_migrate_clear(struct
> > xe_migrate *m,
> >         if (!clear_bo_data && clear_ccs && !IS_DGFX(xe))
> >                 clear_only_system_ccs = true;
> >  
> > -       if (!clear_vram)
> > +       if (!clear_vram) {
> >                 xe_res_first_sg(xe_bo_sg(bo), 0, bo->size, &src_it);
> > -       else
> > +       } else {
> >                 xe_res_first(src, 0, bo->size, &src_it);
> > +               if (!(clear_flags & XE_MIGRATE_CLEAR_NON_DIRTY))
> > +                       size -= xe_res_next_dirty(&src_it);
> > +       }
> >  
> >         while (size) {
> >                 u64 clear_L0_ofs;
> > @@ -1125,6 +1128,9 @@ struct dma_fence *xe_migrate_clear(struct
> > xe_migrate *m,
> >                         emit_pte(m, bb, clear_L0_pt, clear_vram,
> > clear_only_system_ccs,
> >                                  &src_it, clear_L0, dst);
> >  
> > +               if (clear_vram && !(clear_flags &
> > XE_MIGRATE_CLEAR_NON_DIRTY))
> > +                       size -= xe_res_next_dirty(&src_it);
> > +
> >                 bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
> >                 update_idx = bb->len;
> >  
> > @@ -1146,7 +1152,7 @@ struct dma_fence *xe_migrate_clear(struct
> > xe_migrate *m,
> >                 }
> >  
> >                 xe_sched_job_add_migrate_flush(job, flush_flags);
> > -               if (!fence) {
> > +               if (fence == dma_fence_get_stub()) {
> >                         /*
> >                          * There can't be anything userspace related
> > at this
> >                          * point, so we just need to respect any
> > potential move
> > diff --git a/drivers/gpu/drm/xe/xe_migrate.h
> > b/drivers/gpu/drm/xe/xe_migrate.h
> > index fb9839c1bae0..58a7b747ef11 100644
> > --- a/drivers/gpu/drm/xe/xe_migrate.h
> > +++ b/drivers/gpu/drm/xe/xe_migrate.h
> > @@ -118,6 +118,7 @@ int xe_migrate_access_memory(struct xe_migrate
> > *m, struct xe_bo *bo,
> >  
> >  #define XE_MIGRATE_CLEAR_FLAG_BO_DATA          BIT(0)
> >  #define XE_MIGRATE_CLEAR_FLAG_CCS_DATA         BIT(1)
> > +#define XE_MIGRATE_CLEAR_NON_DIRTY             BIT(2)
> >  #define
> > XE_MIGRATE_CLEAR_FLAG_FULL     (XE_MIGRATE_CLEAR_FLAG_BO_DATA | \
> >                                         XE_MIGRATE_CLEAR_FLAG_CCS_DAT
> > A)
> >  struct dma_fence *xe_migrate_clear(struct xe_migrate *m,
> > diff --git a/drivers/gpu/drm/xe/xe_res_cursor.h
> > b/drivers/gpu/drm/xe/xe_res_cursor.h
> > index d1a403cfb628..630082e809ba 100644
> > --- a/drivers/gpu/drm/xe/xe_res_cursor.h
> > +++ b/drivers/gpu/drm/xe/xe_res_cursor.h
> > @@ -315,6 +315,32 @@ static inline void xe_res_next(struct
> > xe_res_cursor *cur, u64 size)
> >         }
> >  }
> >  
> > +/**
> > + * xe_res_next_dirty - advance the cursor to next dirty buddy block
> > + *
> > + * @cur: the cursor to advance
> > + *
> > + * Move the cursor until dirty buddy block is found.
> > + *
> > + * Return: Number of bytes cursor has been advanced
> > + */
> > +static inline u64 xe_res_next_dirty(struct xe_res_cursor *cur)
> > +{
> > +       struct drm_buddy_block *block = cur->node;
> > +       u64 bytes = 0;
> > +
> > +       XE_WARN_ON(cur->mem_type != XE_PL_VRAM0 &&
> > +                  cur->mem_type != XE_PL_VRAM1);
> 
> What if we have more than just these two? Maybe check against the mask
> instead.
> 
> > +
> > +       while (cur->remaining && drm_buddy_block_is_clear(block)) {
> > +               bytes += cur->size;
> > +               xe_res_next(cur, cur->size);
> > +               block = cur->node;
> > +       }
> > +
> > +       return bytes;
> > +}
> > +
> >  /**
> >   * xe_res_dma - return dma address of cursor at current position
> >   *
> > diff --git a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c
> > b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c
> > index 9e375a40aee9..120046941c1e 100644
> > --- a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c
> > +++ b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c
> > @@ -84,6 +84,9 @@ static int xe_ttm_vram_mgr_new(struct
> > ttm_resource_manager *man,
> >         if (place->fpfn || lpfn != man->size >> PAGE_SHIFT)
> >                 vres->flags |= DRM_BUDDY_RANGE_ALLOCATION;
> >  
> > +       if (tbo->type == ttm_bo_type_device)
> > +               vres->flags |= DRM_BUDDY_CLEAR_ALLOCATION;
> > +
> >         if (WARN_ON(!vres->base.size)) {
> >                 err = -EINVAL;
> >                 goto error_fini;
> > @@ -187,7 +190,7 @@ static void xe_ttm_vram_mgr_del(struct
> > ttm_resource_manager *man,
> >         struct drm_buddy *mm = &mgr->mm;
> >  
> >         mutex_lock(&mgr->lock);
> > -       drm_buddy_free_list(mm, &vres->blocks, 0);
> > +       drm_buddy_free_list(mm, &vres->blocks, vres->flags);
> >         mgr->visible_avail += vres->used_visible_size;
> >         mutex_unlock(&mgr->lock);
> >  
> > diff --git a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.h
> > b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.h
> > index cc76050e376d..dfc0e6890b3c 100644
> > --- a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.h
> > +++ b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.h
> > @@ -36,6 +36,12 @@ to_xe_ttm_vram_mgr_resource(struct ttm_resource
> > *res)
> >         return container_of(res, struct xe_ttm_vram_mgr_resource,
> > base);
> >  }
> >  
> > +static inline void
> > +xe_ttm_vram_mgr_resource_set_cleared(struct ttm_resource *res)
> > +{
> > +       to_xe_ttm_vram_mgr_resource(res)->flags |= DRM_BUDDY_CLEARED;
> 
> I see the amd driver is doing the same thing here. Maybe we can pull
> this in based on the flags given at the resource initialization so we
> can potentially tweak this for different scenarios (when to free) and
> converge what we're doing here and what amd is doing into the common
> code.
> 
> Thanks,
> Stuart
> 
> > +}
> > +
> >  static inline struct xe_ttm_vram_mgr *
> >  to_xe_ttm_vram_mgr(struct ttm_resource_manager *man)
> >  {
>