[PATCH v2] drm/xe: Add stats for vma page faults

Fri Feb 21 12:14:41 UTC 2025

On Mon, Feb 10, 2025 at 04:28:09PM +0000, Matthew Auld wrote:
> On 06/02/2025 13:45, Francois Dugast wrote:
> > Add new entries in stats for vma page faults. If CONFIG_DEBUG_FS is
> > enabled, the count and number of bytes can be viewed per GT in the
> > stats debugfs file. This helps when testing, to confirm page faults
> > have been triggered as expected. It also helps when looking at the
> > performance impact of page faults. Data is simply collected when
> > entering the page fault handler, so there is no indication whether
> > the fault was handled successfully, with or without retries, etc.
> > 
> > Example output:
> > 
> >      cat /sys/kernel/debug/dri/0/gt0/stats
> >      tlb_inval_count: 129
> >      vma_pagefault_count: 12
> >      vma_pagefault_bytes: 98304
> > 
> > v2: Rebase
> > 
> > Signed-off-by: Francois Dugast <francois.dugast at intel.com>
> > ---
> >   drivers/gpu/drm/xe/xe_gt_pagefault.c   | 10 +++++++---
> >   drivers/gpu/drm/xe/xe_gt_stats.c       |  2 ++
> >   drivers/gpu/drm/xe/xe_gt_stats_types.h |  2 ++
> >   3 files changed, 11 insertions(+), 3 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/xe/xe_gt_pagefault.c b/drivers/gpu/drm/xe/xe_gt_pagefault.c
> > index cb92fb5cbc75..46701ca11ce0 100644
> > --- a/drivers/gpu/drm/xe/xe_gt_pagefault.c
> > +++ b/drivers/gpu/drm/xe/xe_gt_pagefault.c
> > @@ -14,6 +14,7 @@
> >   #include "abi/guc_actions_abi.h"
> >   #include "xe_bo.h"
> >   #include "xe_gt.h"
> > +#include "xe_gt_stats.h"
> >   #include "xe_gt_tlb_invalidation.h"
> >   #include "xe_guc.h"
> >   #include "xe_guc_ct.h"
> > @@ -124,16 +125,20 @@ static int xe_pf_begin(struct drm_exec *exec, struct xe_vma *vma,
> >   	return 0;
> >   }
> > -static int handle_vma_pagefault(struct xe_tile *tile, struct pagefault *pf,
> > +static int handle_vma_pagefault(struct xe_gt *gt, struct pagefault *pf,
> >   				struct xe_vma *vma)
> >   {
> >   	struct xe_vm *vm = xe_vma_vm(vma);
> > +	struct xe_tile *tile = gt_to_tile(gt);
> >   	struct drm_exec exec;
> >   	struct dma_fence *fence;
> >   	ktime_t end = 0;
> >   	int err;
> >   	bool atomic;
> > +	xe_gt_stats_incr(gt, XE_GT_STATS_ID_VMA_PAGEFAULT_COUNT, 1);
> > +	xe_gt_stats_incr(gt, XE_GT_STATS_ID_VMA_PAGEFAULT_BYTES, xe_vma_size(vma));
> 
> Since this can be quite a large number, and a normal workload could easily
> fault in many GB across some number of VMAs, maybe we will find that we hit
> the limit of the (32-bit signed) atomic int quite easily (only ~2 GiB)?
> 
> Should we switch the unit over to KB/MB and also start using a 64-bit
> atomic if we want to count things like this? What do you think?

Yes, that makes sense. Let me send a follow-up.

Francois

> 
> > +
> >   	trace_xe_vma_pagefault(vma);
> >   	atomic = access_is_atomic(pf->access_type);
> > @@ -202,7 +207,6 @@ static struct xe_vm *asid_to_vm(struct xe_device *xe, u32 asid)
> >   static int handle_pagefault(struct xe_gt *gt, struct pagefault *pf)
> >   {
> >   	struct xe_device *xe = gt_to_xe(gt);
> > -	struct xe_tile *tile = gt_to_tile(gt);
> >   	struct xe_vm *vm;
> >   	struct xe_vma *vma = NULL;
> >   	int err;
> > @@ -231,7 +235,7 @@ static int handle_pagefault(struct xe_gt *gt, struct pagefault *pf)
> >   		goto unlock_vm;
> >   	}
> > -	err = handle_vma_pagefault(tile, pf, vma);
> > +	err = handle_vma_pagefault(gt, pf, vma);
> >   unlock_vm:
> >   	if (!err)
> > diff --git a/drivers/gpu/drm/xe/xe_gt_stats.c b/drivers/gpu/drm/xe/xe_gt_stats.c
> > index 7a6c1d808e41..2e9879ea4674 100644
> > --- a/drivers/gpu/drm/xe/xe_gt_stats.c
> > +++ b/drivers/gpu/drm/xe/xe_gt_stats.c
> > @@ -28,6 +28,8 @@ void xe_gt_stats_incr(struct xe_gt *gt, const enum xe_gt_stats_id id, int incr)
> >   static const char *const stat_description[__XE_GT_STATS_NUM_IDS] = {
> >   	"tlb_inval_count",
> > +	"vma_pagefault_count",
> > +	"vma_pagefault_bytes",
> >   };
> >   /**
> > diff --git a/drivers/gpu/drm/xe/xe_gt_stats_types.h b/drivers/gpu/drm/xe/xe_gt_stats_types.h
> > index 2fc055e39f27..b072bd80c4b9 100644
> > --- a/drivers/gpu/drm/xe/xe_gt_stats_types.h
> > +++ b/drivers/gpu/drm/xe/xe_gt_stats_types.h
> > @@ -8,6 +8,8 @@
> >   enum xe_gt_stats_id {
> >   	XE_GT_STATS_ID_TLB_INVAL,
> > +	XE_GT_STATS_ID_VMA_PAGEFAULT_COUNT,
> > +	XE_GT_STATS_ID_VMA_PAGEFAULT_BYTES,
> >   	/* must be the last entry */
> >   	__XE_GT_STATS_NUM_IDS,
> >   };
>