[PATCH v4 2/2] drm/xe/bo: add GPU memory trace points

Mon Jul 7 18:07:39 UTC 2025

On Thu, 2025-06-12 at 17:46 +0100, Tvrtko Ursulin wrote:
> 
> On 12/06/2025 06:40, Lucas De Marchi wrote:
> > On Wed, Jun 11, 2025 at 03:51:24PM -0700, Juston Li wrote:
> > > Add TRACE_GPU_MEM tracepoints for tracking global and per-process
> > > GPU
> > > memory usage.
> > > 
> > > These are required by VSR on Android 12+ for reporting GPU driver
> > > memory
> > > allocations.
> > > 
> > > v3:
> > > - Use now configurable CONFIG_TRACE_GPU_MEM instead of adding a
> > >   per-driver Kconfig (Lucas)
> > > 
> > > v2:
> > > - Use u64 as preferred by checkpatch (Tvrtko)
> > > - Fix errors in comments/Kconfig description (Tvrtko)
> > > - drop redundant "CONFIG" in Kconfig
> > > 
> > > Signed-off-by: Juston Li <justonli at chromium.org>
> > > Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin at igalia.com>
> > > ---
> > > drivers/gpu/drm/xe/xe_bo.c           | 47
> > > ++++++++++++++++++++++++++++
> > > drivers/gpu/drm/xe/xe_device_types.h | 16 ++++++++++
> > > 2 files changed, 63 insertions(+)
> > > 
> > > diff --git a/drivers/gpu/drm/xe/xe_bo.c
> > > b/drivers/gpu/drm/xe/xe_bo.c
> > > index 4e39188a021ab..89a3d23e3b800 100644
> > > --- a/drivers/gpu/drm/xe/xe_bo.c
> > > +++ b/drivers/gpu/drm/xe/xe_bo.c
> > > @@ -19,6 +19,8 @@
> > > 
> > > #include <kunit/static_stub.h>
> > > 
> > > +#include <trace/events/gpu_mem.h>
> > > +
> > > #include "xe_device.h"
> > > #include "xe_dma_buf.h"
> > > #include "xe_drm_client.h"
> > > @@ -418,6 +420,35 @@ static void
> > > xe_ttm_tt_account_subtract(struct 
> > > xe_device *xe, struct ttm_tt *tt)
> > >         xe_shrinker_mod_pages(xe->mem.shrinker, -(long)tt-
> > > >num_pages, 0);
> > > }
> > > 
> > > +#if IS_ENABLED(CONFIG_TRACE_GPU_MEM)
> > > +static void update_global_total_pages(struct ttm_device
> > > *ttm_dev, 
> > > long num_pages)
> > > +{
> > > +    struct xe_device *xe = ttm_to_xe_device(ttm_dev);
> > > +    u64 global_total_pages =
> > > +        atomic64_add_return(num_pages, &xe->global_total_pages);
> > > +
> > > +    trace_gpu_mem_total(0, 0, global_total_pages << PAGE_SHIFT);
> > > +}
> > > +
> > > +static void update_process_mem(struct drm_file *file, ssize_t
> > > size)
> > > +{
> > > +    struct xe_file *xef = to_xe_file(file);
> > > +    u64 process_mem = atomic64_add_return(size, &xef-
> > > >process_mem);
> > > +
> > > +    rcu_read_lock(); /* Locks file->pid! */
> > > +    trace_gpu_mem_total(0, pid_nr(rcu_dereference(file->pid)), 
> > > process_mem);
> > 
> > Isn't the first arg supposed to be the gpu id? Doesn't this become
> > invalid when I have e.g. LNL + BMG and the trace is enabled?
> 
> Good point.
> 
> u32 gpu_id does not seem possible to map to anything useful.
> 
> Shall we replace it with a string obtained from dev_name(struct
> device 
> *) ? As only Android parses them I think we can get still away with 
> changing the tracepoints ABI.
> 
> > 
> > > +    rcu_read_unlock();
> > > +}
> > > +#else
> > > +static inline void update_global_total_pages(struct ttm_device 
> > > *ttm_dev, long num_pages)
> > > +{
> > > +}
> > > +
> > > +static inline void update_process_mem(struct drm_file *file,
> > > ssize_t 
> > > size)
> > > +{
> > > +}
> > > +#endif
> > > +
> > > static struct ttm_tt *xe_ttm_tt_create(struct ttm_buffer_object
> > > *ttm_bo,
> > >                        u32 page_flags)
> > > {
> > > @@ -525,6 +556,7 @@ static int xe_ttm_tt_populate(struct
> > > ttm_device 
> > > *ttm_dev, struct ttm_tt *tt,
> > > 
> > >     xe_tt->purgeable = false;
> > >     xe_ttm_tt_account_add(ttm_to_xe_device(ttm_dev), tt);
> > > +    update_global_total_pages(ttm_dev, tt->num_pages);
> > > 
> > >     return 0;
> > > }
> > > @@ -541,6 +573,7 @@ static void xe_ttm_tt_unpopulate(struct
> > > ttm_device 
> > > *ttm_dev, struct ttm_tt *tt)
> > > 
> > >     ttm_pool_free(&ttm_dev->pool, tt);
> > >     xe_ttm_tt_account_subtract(xe, tt);
> > > +    update_global_total_pages(ttm_dev, -(long)tt->num_pages);
> > > }
> > > 
> > > static void xe_ttm_tt_destroy(struct ttm_device *ttm_dev, struct 
> > > ttm_tt *tt)
> > > @@ -1653,6 +1686,15 @@ static void xe_gem_object_free(struct 
> > > drm_gem_object *obj)
> > >     ttm_bo_put(container_of(obj, struct ttm_buffer_object,
> > > base));
> > > }
> > > 
> > > +#if IS_ENABLED(CONFIG_TRACE_GPU_MEM)
> > > +static int xe_gem_object_open(struct drm_gem_object *obj,
> > > +                  struct drm_file *file_priv)
> > > +{
> > > +    update_process_mem(file_priv, obj->size);
> > > +    return 0;
> > > +}
> > > +#endif
> > > +
> > > static void xe_gem_object_close(struct drm_gem_object *obj,
> > >                 struct drm_file *file_priv)
> > > {
> > > @@ -1665,6 +1707,8 @@ static void xe_gem_object_close(struct 
> > > drm_gem_object *obj,
> > >         ttm_bo_set_bulk_move(&bo->ttm, NULL);
> > >         xe_bo_unlock(bo);
> > >     }
> > > +
> > > +    update_process_mem(file_priv, -obj->size);
> > > }
> > > 
> > > static vm_fault_t xe_gem_fault(struct vm_fault *vmf)
> > > @@ -1762,6 +1806,9 @@ static const struct vm_operations_struct 
> > > xe_gem_vm_ops = {
> > > 
> > > static const struct drm_gem_object_funcs xe_gem_object_funcs = {
> > >     .free = xe_gem_object_free,
> > > +#if IS_ENABLED(CONFIG_TRACE_GPU_MEM)
> > > +    .open = xe_gem_object_open,
> > > +#endif
> > >     .close = xe_gem_object_close,
> > >     .mmap = drm_gem_ttm_mmap,
> > >     .export = xe_gem_prime_export,
> > > diff --git a/drivers/gpu/drm/xe/xe_device_types.h
> > > b/drivers/gpu/drm/ 
> > > xe/xe_device_types.h
> > > index e5d02a47a5287..b797e766ccbc2 100644
> > > --- a/drivers/gpu/drm/xe/xe_device_types.h
> > > +++ b/drivers/gpu/drm/xe/xe_device_types.h
> > > @@ -641,6 +641,14 @@ struct xe_device {
> > >         unsigned int fsb_freq, mem_freq, is_ddr3;
> > >     };
> > > #endif
> > > +
> > > +#if IS_ENABLED(CONFIG_TRACE_GPU_MEM)
> > > +    /**
> > > +     * @global_total_pages: global GPU page usage tracked for
> > > gpu_mem
> > > +     * tracepoints
> > > +     */
> > > +    atomic64_t global_total_pages;
> > > +#endif
> > > };
> > > 
> > > /**
> > > @@ -702,6 +710,14 @@ struct xe_file {
> > > 
> > >     /** @refcount: ref count of this xe file */
> > >     struct kref refcount;
> > > +
> > > +#if IS_ENABLED(CONFIG_TRACE_GPU_MEM)
> > > +    /**
> > > +     * @process_mem: per-process GPU memory usage tracked for
> > > gpu_mem
> > > +     * tracepoints
> > > +     */
> > > +    atomic64_t process_mem;
> > 
> > so... this is not per-process, it's actually "per dev node" open.
> > Does
> > this map correctly to the intended use and how it's handled in msm?
> 
> Per struct drm_file. Yes it is the same, both do it via 
> drm_gem_object_funcs->open/close.

Sorry this went over my head previously :)

It turns out this does cause issues; Since process_mem is getting
tracked per drm_file, a process with multiple drm_file opens will emit
traces mapping to the same PID entry causing it to overwrite each
other. Eg we actually end up showing the process_mem from the most
recent drm_file trace emitted for that PID, not the collective total of
all drm_file's for that PID...

We'd probably need to add a new field to the ABI; ~ctx_id and let
userspace combine them to get the actual process total.

For now though, it turns out the Android requirement will be satisfied
with just the global_total so for v5 I'll drop process_mem tracking and
evaluate how to add it back later as we have to worry about backwards-
compat for the gpu_trace ABI on Android.

Regards
Juston

> Regards,
> 
> Tvrtko
> 
> > 
> > Lucas De Marchi
> > 
> > > +#endif
> > > };
> > > 
> > > #endif
> > > -- 
> > > 2.50.0.rc1.591.g9c95f17f64-goog
> > >