[PATCH v4 5/5] drm/xe: Make the PT code handle placement per PTE rather than per vma / range
Ghimiray, Himal Prasad
himal.prasad.ghimiray at intel.com
Wed Mar 26 09:48:19 UTC 2025
On 26-03-2025 13:35, Thomas Hellström wrote:
> With SVM, ranges forwarded to the PT code for binding can, mostly
> due to races when migrating, point to both VRAM and system / foreign
> device memory. Make the PT code able to handle that by checking,
> for each PTE set up, whether it points to local VRAM or to system
> memory.
>
> v2:
> - Fix system memory GPU atomic access.
> v3:
> - Avoid the UAPI change. It needs more thought.
>
> Signed-off-by: Thomas Hellström <thomas.hellstrom at linux.intel.com>
> Reviewed-by: Matthew Brost <matthew.brost at intel.com>
> ---
> drivers/gpu/drm/xe/xe_bo.c | 12 +++-
> drivers/gpu/drm/xe/xe_pt.c | 120 +++++++++++++++++++------------------
> 2 files changed, 70 insertions(+), 62 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c
> index 9d043d2c30fd..3c7c2353d3c8 100644
> --- a/drivers/gpu/drm/xe/xe_bo.c
> +++ b/drivers/gpu/drm/xe/xe_bo.c
> @@ -2116,10 +2116,16 @@ uint64_t vram_region_gpu_offset(struct ttm_resource *res)
> {
> struct xe_device *xe = ttm_to_xe_device(res->bo->bdev);
>
> - if (res->mem_type == XE_PL_STOLEN)
> + switch (res->mem_type) {
> + case XE_PL_STOLEN:
> return xe_ttm_stolen_gpu_offset(xe);
> -
> - return res_to_mem_region(res)->dpa_base;
> + case XE_PL_TT:
> + case XE_PL_SYSTEM:
> + return 0;
> + default:
> + return res_to_mem_region(res)->dpa_base;
> + }
> + return 0;
> }
>
> /**
> diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c
> index 9e719535a3bb..82ae159feed1 100644
> --- a/drivers/gpu/drm/xe/xe_pt.c
> +++ b/drivers/gpu/drm/xe/xe_pt.c
> @@ -278,13 +278,15 @@ struct xe_pt_stage_bind_walk {
> struct xe_vm *vm;
> /** @tile: The tile we're building for. */
> struct xe_tile *tile;
> - /** @default_pte: PTE flag only template. No address is associated */
> - u64 default_pte;
> + /** @default_pte: PTE flag only template for VRAM. No address is associated */
> + u64 default_vram_pte;
> + /** @default_pte: PTE flag only template for VRAM. No address is associated */
> + u64 default_system_pte;
> /** @dma_offset: DMA offset to add to the PTE. */
> u64 dma_offset;
> /**
> * @needs_64k: This address range enforces 64K alignment and
> - * granularity.
> + * granularity on VRAM.
> */
> bool needs_64K;
> /**
> @@ -515,13 +517,16 @@ xe_pt_stage_bind_entry(struct xe_ptw *parent, pgoff_t offset,
> if (level == 0 || xe_pt_hugepte_possible(addr, next, level, xe_walk)) {
> struct xe_res_cursor *curs = xe_walk->curs;
> bool is_null = xe_vma_is_null(xe_walk->vma);
> + bool is_vram = is_null ? false : xe_res_is_vram(curs);
>
> XE_WARN_ON(xe_walk->va_curs_start != addr);
>
> pte = vm->pt_ops->pte_encode_vma(is_null ? 0 :
> xe_res_dma(curs) + xe_walk->dma_offset,
> xe_walk->vma, pat_index, level);
> - pte |= xe_walk->default_pte;
> + if (!is_null)
> + pte |= is_vram ? xe_walk->default_vram_pte :
> + xe_walk->default_system_pte;
>
> /*
> * Set the XE_PTE_PS64 hint if possible, otherwise if
> @@ -531,7 +536,7 @@ xe_pt_stage_bind_entry(struct xe_ptw *parent, pgoff_t offset,
> if (xe_pt_is_pte_ps64K(addr, next, xe_walk)) {
> xe_walk->vma->gpuva.flags |= XE_VMA_PTE_64K;
> pte |= XE_PTE_PS64;
> - } else if (XE_WARN_ON(xe_walk->needs_64K)) {
> + } else if (XE_WARN_ON(xe_walk->needs_64K && is_vram)) {
> return -EINVAL;
> }
> }
> @@ -603,6 +608,44 @@ static const struct xe_pt_walk_ops xe_pt_stage_bind_ops = {
> .pt_entry = xe_pt_stage_bind_entry,
> };
>
> +/*
> + * Default atomic expectations for different allocation scenarios are as follows:
> + *
> + * 1. Traditional API: When the VM is not in LR mode:
> + * - Device atomics are expected to function with all allocations.
> + *
> + * 2. Compute/SVM API: When the VM is in LR mode:
> + * - Device atomics are the default behavior when the bo is placed in a single region.
> + * - In all other cases device atomics will be disabled with AE=0 until an application
> + * request differently using a ioctl like madvise.
> + */
> +static bool xe_atomic_for_vram(struct xe_vm *vm)
> +{
> + return true;
> +}
> +
> +static bool xe_atomic_for_system(struct xe_vm *vm, struct xe_bo *bo)
> +{
> + struct xe_device *xe = vm->xe;
> +
> + if (!xe->info.has_device_atomics_on_smem)
> + return false;
> +
> + /*
> + * If a SMEM+LMEM allocation is backed by SMEM, a device
> + * atomics will cause a gpu page fault and which then
> + * gets migrated to LMEM, bind such allocations with
> + * device atomics enabled.
> + *
> + * TODO: Revisit this. Perhaps add something like a
> + * fault_on_atomics_in_system UAPI flag.
> + * Note that this also prohibits GPU atomics in LR mode for
> + * userptr and system memory on DGFX.
> + */
> + return (!IS_DGFX(xe) || (!xe_vm_in_lr_mode(vm) ||
> + (bo && xe_bo_has_single_placement(bo))));
> +}
> +
> /**
> * xe_pt_stage_bind() - Build a disconnected page-table tree for a given address
> * range.
> @@ -629,9 +672,8 @@ xe_pt_stage_bind(struct xe_tile *tile, struct xe_vma *vma,
> {
> struct xe_device *xe = tile_to_xe(tile);
> struct xe_bo *bo = xe_vma_bo(vma);
> - bool is_devmem = !xe_vma_is_userptr(vma) && bo &&
> - (xe_bo_is_vram(bo) || xe_bo_is_stolen_devmem(bo));
> struct xe_res_cursor curs;
> + struct xe_vm *vm = xe_vma_vm(vma);
> struct xe_pt_stage_bind_walk xe_walk = {
> .base = {
> .ops = &xe_pt_stage_bind_ops,
> @@ -639,7 +681,7 @@ xe_pt_stage_bind(struct xe_tile *tile, struct xe_vma *vma,
> .max_level = XE_PT_HIGHEST_LEVEL,
> .staging = true,
> },
> - .vm = xe_vma_vm(vma),
> + .vm = vm,
> .tile = tile,
> .curs = &curs,
> .va_curs_start = range ? range->base.itree.start :
> @@ -647,26 +689,22 @@ xe_pt_stage_bind(struct xe_tile *tile, struct xe_vma *vma,
> .vma = vma,
> .wupd.entries = entries,
> };
> - struct xe_pt *pt = xe_vma_vm(vma)->pt_root[tile->id];
> + struct xe_pt *pt = vm->pt_root[tile->id];
> int ret;
>
> if (range) {
> /* Move this entire thing to xe_svm.c? */
> - xe_svm_notifier_lock(xe_vma_vm(vma));
> + xe_svm_notifier_lock(vm);
> if (!xe_svm_range_pages_valid(range)) {
> xe_svm_range_debug(range, "BIND PREPARE - RETRY");
> - xe_svm_notifier_unlock(xe_vma_vm(vma));
> + xe_svm_notifier_unlock(vm);
> return -EAGAIN;
> }
> if (xe_svm_range_has_dma_mapping(range)) {
> xe_res_first_dma(range->base.dma_addr, 0,
> range->base.itree.last + 1 - range->base.itree.start,
> &curs);
> - is_devmem = xe_res_is_vram(&curs);
> - if (is_devmem)
> - xe_svm_range_debug(range, "BIND PREPARE - DMA VRAM");
> - else
> - xe_svm_range_debug(range, "BIND PREPARE - DMA");
> + xe_svm_range_debug(range, "BIND PREPARE - MIXED");
> } else {
> xe_assert(xe, false);
> }
> @@ -674,54 +712,18 @@ xe_pt_stage_bind(struct xe_tile *tile, struct xe_vma *vma,
> * Note, when unlocking the resource cursor dma addresses may become
> * stale, but the bind will be aborted anyway at commit time.
> */
> - xe_svm_notifier_unlock(xe_vma_vm(vma));
> + xe_svm_notifier_unlock(vm);
> }
>
> - xe_walk.needs_64K = (xe_vma_vm(vma)->flags & XE_VM_FLAG_64K) && is_devmem;
> -
> - /**
> - * Default atomic expectations for different allocation scenarios are as follows:
> - *
> - * 1. Traditional API: When the VM is not in LR mode:
> - * - Device atomics are expected to function with all allocations.
> - *
> - * 2. Compute/SVM API: When the VM is in LR mode:
> - * - Device atomics are the default behavior when the bo is placed in a single region.
> - * - In all other cases device atomics will be disabled with AE=0 until an application
> - * request differently using a ioctl like madvise.
> - */
> + xe_walk.needs_64K = (vm->flags & XE_VM_FLAG_64K);
> if (vma->gpuva.flags & XE_VMA_ATOMIC_PTE_BIT) {
> - if (xe_vm_in_lr_mode(xe_vma_vm(vma))) {
> - if (bo && xe_bo_has_single_placement(bo))
> - xe_walk.default_pte |= XE_USM_PPGTT_PTE_AE;
> - /**
> - * If a SMEM+LMEM allocation is backed by SMEM, a device
> - * atomics will cause a gpu page fault and which then
> - * gets migrated to LMEM, bind such allocations with
> - * device atomics enabled.
> - */
> - else if (is_devmem)
> - xe_walk.default_pte |= XE_USM_PPGTT_PTE_AE;
> - } else {
> - xe_walk.default_pte |= XE_USM_PPGTT_PTE_AE;
> - }
> -
> - /**
> - * Unset AE if the platform(PVC) doesn't support it on an
> - * allocation
> - */
> - if (!xe->info.has_device_atomics_on_smem && !is_devmem)
> - xe_walk.default_pte &= ~XE_USM_PPGTT_PTE_AE;
> - }
> -
> - if (is_devmem) {
> - xe_walk.default_pte |= XE_PPGTT_PTE_DM;
> - xe_walk.dma_offset = bo ? vram_region_gpu_offset(bo->ttm.resource) : 0;
> + xe_walk.default_vram_pte = xe_atomic_for_vram(vm) ? XE_USM_PPGTT_PTE_AE : 0;
> + xe_walk.default_system_pte = xe_atomic_for_system(vm, bo) ?
> + XE_USM_PPGTT_PTE_AE : 0;
> }
>
> - if (!xe_vma_has_no_bo(vma) && xe_bo_is_stolen(bo))
> - xe_walk.dma_offset = xe_ttm_stolen_gpu_offset(xe_bo_device(bo));
> -
> + xe_walk.default_vram_pte |= XE_PPGTT_PTE_DM;
> + xe_walk.dma_offset = bo ? vram_region_gpu_offset(bo->ttm.resource) : 0;
LGTM
Reviewed-by: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
> if (!range)
> xe_bo_assert_held(bo);
>
More information about the Intel-xe
mailing list