[PATCH] drm/xe: Thread prefetch of SVM ranges
Thomas Hellström
thomas.hellstrom at linux.intel.com
Mon Jun 16 08:28:16 UTC 2025
On Sun, 2025-06-15 at 23:47 -0700, Matthew Brost wrote:
> The migrate_vma_* functions are very CPU-intensive; as a result,
> prefetching SVM ranges is limited by CPU performance rather than
> paging copy engine bandwidth. To accelerate SVM range prefetching,
> the step that calls migrate_vma_* is now threaded. This uses a
> dedicated workqueue, as the page fault workqueue cannot be shared
> without risking deadlocks: the prefetch IOCTL holds the VM lock in
> write mode while work items in the page fault workqueue also require
> the VM lock.
>
> The prefetch workqueue is currently allocated in the GT, similar to
> the page fault workqueue. While this is likely not the ideal location
> for either, refactoring will be deferred to a later patch.
>
> Running xe_exec_system_allocator --r prefetch-benchmark, which tests
> 64MB prefetches, shows an increase from ~4.35 GB/s to 12.25 GB/s with
> this patch on drm-tip. Enabling high SLPC further increases
> throughput to ~15.25 GB/s, and combining SLPC with ULLS raises it to
> ~16 GB/s. Both of these optimizations are upcoming.
>
> v2:
> - Use dedicated prefetch workqueue
> - Pick dedicated prefetch thread count based on profiling
> - Skip threaded prefetch for only 1 range or if prefetching to SRAM
> - Fully tested
>
> Cc: Thomas Hellström <thomas.hellstrom at linux.intel.com>
> Cc: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
> Signed-off-by: Matthew Brost <matthew.brost at intel.com>
Hi,
Is this really the right place to do optimizations like this?

The migration takes place in xe_svm_alloc_vram() and is being moved to
drm_pagemap_populate_mm(). If those functions are considered to be
slow, shouldn't they be optimized rather than called multiple times in
parallel from an outer layer?
Before doing something like this I think we need to consider:

1) Why are the migrate functions so CPU-consuming? Do we have a
   performance profile for it?
2) Do we actually *want* to use 5 CPU cores for this?
3) Isn't a single-CPU write-combined non-temporal memcpy enough to
   saturate the system->VRAM bandwidth? A rough sketch of what I mean
   is below.
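
To make (3) concrete, something along these lines is what I have in
mind. This is a rough userspace-style sketch only, not xe code: the
helper name and the alignment/size assumptions are made up for
illustration, and it assumes x86 SSE2 streaming-store intrinsics are
available.

#include <emmintrin.h>
#include <stddef.h>

/*
 * Illustration only: copy n bytes from cached system memory to a
 * write-combined VRAM mapping using non-temporal stores.
 * Assumes dst and src are 16-byte aligned and n is a multiple of 64.
 */
static void wc_nt_copy(void *dst, const void *src, size_t n)
{
	__m128i *d = (__m128i *)dst;
	const __m128i *s = (const __m128i *)src;
	size_t i;

	for (i = 0; i < n / sizeof(__m128i); i += 4) {
		/* Regular (cached) loads from system memory. */
		__m128i v0 = _mm_loadu_si128(s + i + 0);
		__m128i v1 = _mm_loadu_si128(s + i + 1);
		__m128i v2 = _mm_loadu_si128(s + i + 2);
		__m128i v3 = _mm_loadu_si128(s + i + 3);

		/*
		 * Streaming stores bypass the cache and fill whole
		 * write-combining buffers, which is what keeps the bus
		 * busy from a single core.
		 */
		_mm_stream_si128(d + i + 0, v0);
		_mm_stream_si128(d + i + 1, v1);
		_mm_stream_si128(d + i + 2, v2);
		_mm_stream_si128(d + i + 3, v3);
	}

	/* Order the streamed stores before any completion signalling. */
	_mm_sfence();
}

If a single core running a loop like that can already saturate the
system->VRAM path, then spreading the migrate step over five workers
mostly spends extra CPU for the same bandwidth.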
Thanks,
Thomas
> ---
>  drivers/gpu/drm/xe/xe_gt_pagefault.c |  31 ++++++-
>  drivers/gpu/drm/xe/xe_gt_types.h     |   2 +
>  drivers/gpu/drm/xe/xe_vm.c           | 128 +++++++++++++++++++++------
>  3 files changed, 135 insertions(+), 26 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_gt_pagefault.c b/drivers/gpu/drm/xe/xe_gt_pagefault.c
> index e2d975b2fddb..941cca3371f2 100644
> --- a/drivers/gpu/drm/xe/xe_gt_pagefault.c
> +++ b/drivers/gpu/drm/xe/xe_gt_pagefault.c
> @@ -400,6 +400,8 @@ static void pagefault_fini(void *arg)
>
>  	destroy_workqueue(gt->usm.acc_wq);
>  	destroy_workqueue(gt->usm.pf_wq);
> +	if (gt->usm.prefetch_wq)
> +		destroy_workqueue(gt->usm.prefetch_wq);
>  }
>
>  static int xe_alloc_pf_queue(struct xe_gt *gt, struct pf_queue *pf_queue)
> @@ -438,10 +440,24 @@ static int xe_alloc_pf_queue(struct xe_gt *gt, struct pf_queue *pf_queue)
>  	return 0;
>  }
>
> +static int prefetch_thread_count(struct xe_device *xe)
> +{
> +	if (!IS_DGFX(xe))
> +		return 0;
> +
> +	/*
> +	 * Based on profiling large aligned 2M prefetches, this is the optimal
> +	 * number of threads on BMG (only platform currently supported). This
> +	 * should be tuned for each supported platform and can change on a
> +	 * per-platform basis as optimizations land (e.g., large device pages).
> +	 */
> +	return 5;
> +}
> +
>  int xe_gt_pagefault_init(struct xe_gt *gt)
>  {
>  	struct xe_device *xe = gt_to_xe(gt);
> -	int i, ret = 0;
> +	int i, count, ret = 0;
>
>  	if (!xe->info.has_usm)
>  		return 0;
> @@ -462,10 +478,23 @@ int xe_gt_pagefault_init(struct xe_gt *gt)
>  	if (!gt->usm.pf_wq)
>  		return -ENOMEM;
>
> +	count = prefetch_thread_count(xe);
> +	if (count) {
> +		gt->usm.prefetch_wq = alloc_workqueue("xe_gt_prefetch_work_queue",
> +						      WQ_UNBOUND | WQ_HIGHPRI,
> +						      count);
> +		if (!gt->usm.prefetch_wq) {
> +			destroy_workqueue(gt->usm.pf_wq);
> +			return -ENOMEM;
> +		}
> +	}
> +
>  	gt->usm.acc_wq = alloc_workqueue("xe_gt_access_counter_work_queue",
>  					 WQ_UNBOUND | WQ_HIGHPRI,
>  					 NUM_ACC_QUEUE);
>  	if (!gt->usm.acc_wq) {
> +		if (gt->usm.prefetch_wq)
> +			destroy_workqueue(gt->usm.prefetch_wq);
>  		destroy_workqueue(gt->usm.pf_wq);
>  		return -ENOMEM;
>  	}
> diff --git a/drivers/gpu/drm/xe/xe_gt_types.h b/drivers/gpu/drm/xe/xe_gt_types.h
> index 7def0959da35..d9ba4921b8ce 100644
> --- a/drivers/gpu/drm/xe/xe_gt_types.h
> +++ b/drivers/gpu/drm/xe/xe_gt_types.h
> @@ -239,6 +239,8 @@ struct xe_gt {
>  		u16 reserved_bcs_instance;
>  		/** @usm.pf_wq: page fault work queue, unbound, high priority */
>  		struct workqueue_struct *pf_wq;
> +		/** @usm.prefetch_wq: prefetch work queue, unbound, high priority */
> +		struct workqueue_struct *prefetch_wq;
>  		/** @usm.acc_wq: access counter work queue, unbound, high priority */
>  		struct workqueue_struct *acc_wq;
>  		/**
> diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
> index 6ef8c4dab647..1ae8e03aead6 100644
> --- a/drivers/gpu/drm/xe/xe_vm.c
> +++ b/drivers/gpu/drm/xe/xe_vm.c
> @@ -2885,52 +2885,130 @@ static int check_ufence(struct xe_vma *vma)
>  	return 0;
>  }
>
> -static int prefetch_ranges(struct xe_vm *vm, struct xe_vma_op *op)
> +struct prefetch_thread {
> +	struct work_struct work;
> +	struct drm_gpusvm_ctx *ctx;
> +	struct xe_vma *vma;
> +	struct xe_svm_range *svm_range;
> +	struct xe_tile *tile;
> +	u32 region;
> +	int err;
> +};
> +
> +static void prefetch_work_func(struct work_struct *w)
>  {
> -	bool devmem_possible = IS_DGFX(vm->xe) && IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR);
> -	struct xe_vma *vma = gpuva_to_vma(op->base.prefetch.va);
> +	struct prefetch_thread *thread =
> +		container_of(w, struct prefetch_thread, work);
> +	struct xe_vma *vma = thread->vma;
> +	struct xe_vm *vm = xe_vma_vm(vma);
> +	struct xe_svm_range *svm_range = thread->svm_range;
> +	u32 region = thread->region;
> +	struct xe_tile *tile = thread->tile;
>  	int err = 0;
>
> -	struct xe_svm_range *svm_range;
> +	if (!region) {
> +		xe_svm_range_migrate_to_smem(vm, svm_range);
> +	} else if (xe_svm_range_needs_migrate_to_vram(svm_range, vma, region)) {
> +		err = xe_svm_alloc_vram(vm, tile, svm_range, thread->ctx);
> +		if (err) {
> +			drm_dbg(&vm->xe->drm,
> +				"VRAM allocation failed, retry from userspace, asid=%u, gpusvm=%p, errno=%pe\n",
> +				vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err));
> +			thread->err = -ENODATA;
> +			return;
> +		}
> +		xe_svm_range_debug(svm_range, "PREFETCH - RANGE MIGRATED TO VRAM");
> +	}
> +
> +	err = xe_svm_range_get_pages(vm, svm_range, thread->ctx);
> +	if (err) {
> +		drm_dbg(&vm->xe->drm, "Get pages failed, asid=%u, gpusvm=%p, errno=%pe\n",
> +			vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err));
> +		if (err == -EOPNOTSUPP || err == -EFAULT || err == -EPERM)
> +			err = -ENODATA;
> +		thread->err = err;
> +		return;
> +	}
> +
> +	xe_svm_range_debug(svm_range, "PREFETCH - RANGE GET PAGES DONE");
> +}
> +
> +static int prefetch_ranges(struct xe_vm *vm, struct xe_vma_op *op)
> +{
> +	struct xe_vma *vma = gpuva_to_vma(op->base.prefetch.va);
> +	u32 j, region = op->prefetch_range.region;
>  	struct drm_gpusvm_ctx ctx = {};
> -	struct xe_tile *tile;
> +	struct prefetch_thread stack_thread;
> +	struct xe_svm_range *svm_range;
> +	struct xarray prefetches;
> +	bool sram = region_to_mem_type[region] == XE_PL_TT;
> +	struct xe_tile *tile = sram ? xe_device_get_root_tile(vm->xe) :
> +		&vm->xe->tiles[region_to_mem_type[region] - XE_PL_VRAM0];
>  	unsigned long i;
> -	u32 region;
> +	bool devmem_possible = IS_DGFX(vm->xe) &&
> +		IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR);
> +	bool skip_threads = op->prefetch_range.ranges_count == 1 || sram;
> +	struct prefetch_thread *thread = skip_threads ? &stack_thread : NULL;
> +	int err = 0;
>
>  	if (!xe_vma_is_cpu_addr_mirror(vma))
>  		return 0;
>
> -	region = op->prefetch_range.region;
> +	if (!skip_threads)
> +		xa_init_flags(&prefetches, XA_FLAGS_ALLOC);
>
>  	ctx.read_only = xe_vma_read_only(vma);
>  	ctx.devmem_possible = devmem_possible;
>  	ctx.check_pages_threshold = devmem_possible ? SZ_64K : 0;
>
> -	/* TODO: Threading the migration */
>  	xa_for_each(&op->prefetch_range.range, i, svm_range) {
> -		if (!region)
> -			xe_svm_range_migrate_to_smem(vm, svm_range);
> +		if (!skip_threads) {
> +			thread = kmalloc(sizeof(*thread), GFP_KERNEL);
> +			if (!thread)
> +				goto wait_threads;
>
> -		if (xe_svm_range_needs_migrate_to_vram(svm_range, vma, region)) {
> -			tile = &vm->xe->tiles[region_to_mem_type[region] - XE_PL_VRAM0];
> -			err = xe_svm_alloc_vram(vm, tile, svm_range, &ctx);
> +			err = xa_alloc(&prefetches, &j, thread, xa_limit_32b,
> +				       GFP_KERNEL);
>  			if (err) {
> -				drm_dbg(&vm->xe->drm, "VRAM allocation failed, retry from userspace, asid=%u, gpusvm=%p, errno=%pe\n",
> -					vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err));
> -				return -ENODATA;
> +				kfree(thread);
> +				goto wait_threads;
>  			}
> -			xe_svm_range_debug(svm_range, "PREFETCH - RANGE MIGRATED TO VRAM");
>  		}
>
> -		err = xe_svm_range_get_pages(vm, svm_range, &ctx);
> -		if (err) {
> -			drm_dbg(&vm->xe->drm, "Get pages failed, asid=%u, gpusvm=%p, errno=%pe\n",
> -				vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err));
> -			if (err == -EOPNOTSUPP || err == -EFAULT || err == -EPERM)
> -				err = -ENODATA;
> -			return err;
> +		INIT_WORK(&thread->work, prefetch_work_func);
> +		thread->ctx = &ctx;
> +		thread->vma = vma;
> +		thread->svm_range = svm_range;
> +		thread->tile = tile;
> +		thread->region = region;
> +		thread->err = 0;
> +
> +		if (skip_threads) {
> +			prefetch_work_func(&thread->work);
> +			if (thread->err)
> +				return thread->err;
> +		} else {
> +			/*
> +			 * Prefetch uses a dedicated workqueue, as the page
> +			 * fault workqueue cannot be shared without risking
> +			 * deadlocks, due to holding the VM lock in write mode
> +			 * here while work items in the page fault workqueue
> +			 * also require the VM lock.
> +			 */
> +			queue_work(tile->primary_gt->usm.prefetch_wq,
> +				   &thread->work);
> +		}
> +	}
> +
> +wait_threads:
> +	if (!skip_threads) {
> +		xa_for_each(&prefetches, i, thread) {
> +			flush_work(&thread->work);
> +			if (thread->err && (!err || err == -ENODATA))
> +				err = thread->err;
> +			kfree(thread);
>  		}
> -		xe_svm_range_debug(svm_range, "PREFETCH - RANGE GET PAGES DONE");
> +		xa_destroy(&prefetches);
>  	}
>
>  	return err;