[PATCH 2/2] drm/xe: Userptr invalidation race with binds fixes
Thomas Hellström
thomas.hellstrom at linux.intel.com
Fri Feb 21 11:34:12 UTC 2025
On Thu, 2025-02-20 at 15:45 -0800, Matthew Brost wrote:
> Squash bind operations after a userptr invalidation into a clearing of
> PTEs. This prevents valid GPU page tables from pointing to stale CPU
> pages.
>
> Fix up initial bind handling to always add VMAs to the invalidation
> list and clear PTEs.
>
> Remove unused rebind variable in xe_pt.
>
> Always hold the notifier lock across the TLB invalidation in the
> notifier to prevent a UAF if an unbind races.
>
> Including all of the above changes in a single Fixes patch in hopes of
> an easier backport.
>
> Cc: Thomas Hellström <thomas.hellstrom at linux.intel.com>
> Cc: <stable at vger.kernel.org>
> Fixes: e8babb280b5e ("drm/xe: Convert multiple bind ops into single job")
> Signed-off-by: Matthew Brost <matthew.brost at intel.com>
> ---
> drivers/gpu/drm/xe/regs/xe_gtt_defs.h | 1 +
> drivers/gpu/drm/xe/xe_pt.c | 86 +++++++++++++++++++--------
> drivers/gpu/drm/xe/xe_pt_types.h | 1 -
> drivers/gpu/drm/xe/xe_vm.c | 4 +-
> 4 files changed, 64 insertions(+), 28 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/regs/xe_gtt_defs.h
> b/drivers/gpu/drm/xe/regs/xe_gtt_defs.h
> index 4389e5a76f89..ab281e721d94 100644
> --- a/drivers/gpu/drm/xe/regs/xe_gtt_defs.h
> +++ b/drivers/gpu/drm/xe/regs/xe_gtt_defs.h
> @@ -13,6 +13,7 @@
>
> #define GUC_GGTT_TOP 0xFEE00000
>
> +#define XE_PPGTT_IS_LEAF BIT_ULL(63)
> #define XELPG_PPGTT_PTE_PAT3 BIT_ULL(62)
> #define XE2_PPGTT_PTE_PAT4 BIT_ULL(61)
> #define XE_PPGTT_PDE_PDPE_PAT2 BIT_ULL(12)
> diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c
> index 1ddcc7e79a93..26b513a54113 100644
> --- a/drivers/gpu/drm/xe/xe_pt.c
> +++ b/drivers/gpu/drm/xe/xe_pt.c
> @@ -389,6 +389,8 @@ xe_pt_insert_entry(struct xe_pt_stage_bind_walk
> *xe_walk, struct xe_pt *parent,
> idx = offset - entry->ofs;
> entry->pt_entries[idx].pt = xe_child;
> entry->pt_entries[idx].pte = pte;
> + if (!xe_child)
> + entry->pt_entries[idx].pte |= XE_PPGTT_IS_LEAF;
I don't think this is needed.
> entry->qwords++;
> }
>
> @@ -792,6 +794,24 @@ static const struct xe_pt_walk_ops
> xe_pt_zap_ptes_ops = {
> .pt_entry = xe_pt_zap_ptes_entry,
> };
>
> +static bool __xe_pt_zap_ptes(struct xe_tile *tile, struct xe_vma *vma)
> +{
> + struct xe_pt_zap_ptes_walk xe_walk = {
> + .base = {
> + .ops = &xe_pt_zap_ptes_ops,
> + .shifts = xe_normal_pt_shifts,
> + .max_level = XE_PT_HIGHEST_LEVEL,
> + },
> + .tile = tile,
> + };
> + struct xe_pt *pt = xe_vma_vm(vma)->pt_root[tile->id];
> +
> + (void)xe_pt_walk_shared(&pt->base, pt->level, xe_vma_start(vma),
> + xe_vma_end(vma), &xe_walk.base);
> +
> + return xe_walk.needs_invalidate;
> +}
> +
> /**
> * xe_pt_zap_ptes() - Zap (zero) gpu ptes of an address range
> * @tile: The tile we're zapping for.
> @@ -810,24 +830,12 @@ static const struct xe_pt_walk_ops
> xe_pt_zap_ptes_ops = {
> */
> bool xe_pt_zap_ptes(struct xe_tile *tile, struct xe_vma *vma)
> {
> - struct xe_pt_zap_ptes_walk xe_walk = {
> - .base = {
> - .ops = &xe_pt_zap_ptes_ops,
> - .shifts = xe_normal_pt_shifts,
> - .max_level = XE_PT_HIGHEST_LEVEL,
> - },
> - .tile = tile,
> - };
> - struct xe_pt *pt = xe_vma_vm(vma)->pt_root[tile->id];
> u8 pt_mask = (vma->tile_present & ~vma->tile_invalidated);
>
> if (!(pt_mask & BIT(tile->id)))
> return false;
>
> - (void)xe_pt_walk_shared(&pt->base, pt->level, xe_vma_start(vma),
> - xe_vma_end(vma), &xe_walk.base);
> -
> - return xe_walk.needs_invalidate;
> + return __xe_pt_zap_ptes(tile, vma);
> }
>
> static void
> @@ -843,9 +851,10 @@ xe_vm_populate_pgtable(struct
> xe_migrate_pt_update *pt_update, struct xe_tile *t
> for (i = 0; i < num_qwords; i++) {
> if (map)
> xe_map_wr(tile_to_xe(tile), map, (qword_ofs + i) *
> - sizeof(u64), u64, ptes[i].pte);
> + sizeof(u64), u64, ptes[i].pte &
> + ~XE_PPGTT_IS_LEAF);
> else
> - ptr[i] = ptes[i].pte;
> + ptr[i] = ptes[i].pte & ~XE_PPGTT_IS_LEAF;
> }
> }
>
> @@ -1201,7 +1210,30 @@ static bool xe_pt_userptr_inject_eagain(struct
> xe_userptr_vma *uvma)
>
> #endif
>
> -static int vma_check_userptr(struct xe_vm *vm, struct xe_vma *vma,
> +static void
> +vma_convert_to_invalidation(struct xe_tile *tile, struct xe_vma *vma,
> + struct xe_vm_pgtable_update_ops *pt_update)
> +{
> + int i, j, k;
> +
> + for (i = 0; i < pt_update->current_op; ++i) {
> + struct xe_vm_pgtable_update_op *op = &pt_update->ops[i];
> +
> + if (vma == op->vma && (op->bind || op->rebind)) {
> + for (j = 0; j < op->num_entries; ++j) {
> + for (k = 0; k < op->entries[j].qwords; ++k)
> + if (op->entries[j].pt_entries[k].pte &
> + XE_PPGTT_IS_LEAF)
IIRC it's a leaf iff op->entries[j].pt->level == 0 || xe_pte_is_huge(pte)
(where xe_pte_is_huge() would still need to be coded).
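Roughly what I have in mind, as an untested sketch (xe_pte_is_huge()
doesn't exist yet, and I'm assuming the XE_PDE_PS_2M / XE_PDPE_PS_1G
bits from xe_gtt_defs.h are the right ones to test for huge entries):

	/* @level is the level of the page table holding the entry. */
	static bool xe_pte_is_huge(u64 pte, unsigned int level)
	{
		/* A level-0 table only holds 4K leaf PTEs. */
		if (!level)
			return false;

		/* Above level 0, a leaf is a huge-page entry with the
		 * page-size bit for that level set; anything else
		 * points to a child page table. */
		return pte & (level == 1 ? XE_PDE_PS_2M : XE_PDPE_PS_1G);
	}

and then the check in the loop would be something like

	bool leaf = !op->entries[j].pt->level ||
		    xe_pte_is_huge(op->entries[j].pt_entries[k].pte,
				   op->entries[j].pt->level);

which avoids stealing bit 63 of the PTE for bookkeeping.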
> + op->entries[j].pt_entries[k].pte = 0;
This should be a scratch pte unless it's a faulting VM?
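I.e. something along these lines, assuming __xe_pt_empty_pte() (or an
equivalent scratch-PTE lookup) can be reused from this context; sketch
only:

	struct xe_vm *vm = xe_vma_vm(vma);

	/* Point the entry at scratch rather than zeroing it; a
	 * faulting VM wants a genuinely invalid entry so the fault
	 * handler kicks in, everything else should see scratch. */
	op->entries[j].pt_entries[k].pte =
		xe_vm_in_fault_mode(vm) ? 0 :
		__xe_pt_empty_pte(tile, vm, op->entries[j].pt->level);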
> + }
> + }
> + }
> +
> + __xe_pt_zap_ptes(tile, vma);
Umm, a zap here would race completely with other pipelined updates,
right? Why is it needed?
> +}
> +
> +static int vma_check_userptr(struct xe_tile *tile, struct xe_vm *vm,
> + struct xe_vma *vma,
> struct xe_vm_pgtable_update_ops *pt_update)
> {
> struct xe_userptr_vma *uvma;
> @@ -1215,9 +1247,6 @@ static int vma_check_userptr(struct xe_vm *vm,
> struct xe_vma *vma,
> uvma = to_userptr_vma(vma);
> notifier_seq = uvma->userptr.notifier_seq;
>
> - if (uvma->userptr.initial_bind && !xe_vm_in_fault_mode(vm))
> - return 0;
> -
> if (!mmu_interval_read_retry(&uvma->userptr.notifier,
> notifier_seq) &&
> !xe_pt_userptr_inject_eagain(uvma))
> @@ -1231,6 +1260,8 @@ static int vma_check_userptr(struct xe_vm *vm,
> struct xe_vma *vma,
> &vm->userptr.invalidated);
> spin_unlock(&vm->userptr.invalidated_lock);
>
> + vma_convert_to_invalidation(tile, vma, pt_update);
> +
> if (xe_vm_in_preempt_fence_mode(vm)) {
> struct dma_resv_iter cursor;
> struct dma_fence *fence;
And I really think here we should remove the open-coded and
undocumented invalidation (it took me like 15 minutes to understand
what the code did, and that's even though I'm already up-to-date with
how invalidation works).
If just returning -EAGAIN doesn't really work here for the preempt-
fence case, then I suggest that the error injection instead triggers a
real invalidation outside of the notifier lock.
Perhaps using mmu_notifier_invalidate_range_[start|end]()?
But for now, since this is a fix, restrict the -EAGAIN inject to the
cases where it works and that we actually want to test.
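For that last part, something like the below at the call site is what
I'm thinking of (sketch only; the added condition is a placeholder for
"cases where it works"):

	if (!mmu_interval_read_retry(&uvma->userptr.notifier,
				     notifier_seq) &&
	    !(xe_pt_userptr_inject_eagain(uvma) &&
	      xe_vm_in_fault_mode(vm)))
		return 0;

i.e. keep the injection for the fault-mode path, where the resulting
-EAGAIN is what a real invalidation would produce anyway, and widen it
again once the preempt-fence path is sorted out.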
Thanks,
/Thomas
> @@ -1252,7 +1283,8 @@ static int vma_check_userptr(struct xe_vm *vm,
> struct xe_vma *vma,
> return 0;
> }
>
> -static int op_check_userptr(struct xe_vm *vm, struct xe_vma_op *op,
> +static int op_check_userptr(struct xe_tile *tile, struct xe_vm *vm,
> + struct xe_vma_op *op,
> struct xe_vm_pgtable_update_ops *pt_update)
> {
> int err = 0;
> @@ -1264,18 +1296,21 @@ static int op_check_userptr(struct xe_vm *vm,
> struct xe_vma_op *op,
> if (!op->map.immediate && xe_vm_in_fault_mode(vm))
> break;
>
> - err = vma_check_userptr(vm, op->map.vma, pt_update);
> + err = vma_check_userptr(tile, vm, op->map.vma, pt_update);
> break;
> case DRM_GPUVA_OP_REMAP:
> if (op->remap.prev)
> - err = vma_check_userptr(vm, op->remap.prev, pt_update);
> + err = vma_check_userptr(tile, vm, op->remap.prev,
> + pt_update);
> if (!err && op->remap.next)
> - err = vma_check_userptr(vm, op->remap.next, pt_update);
> + err = vma_check_userptr(tile, vm, op->remap.next,
> + pt_update);
> break;
> case DRM_GPUVA_OP_UNMAP:
> break;
> case DRM_GPUVA_OP_PREFETCH:
> - err = vma_check_userptr(vm, gpuva_to_vma(op->base.prefetch.va),
> + err = vma_check_userptr(tile, vm,
> + gpuva_to_vma(op->base.prefetch.va),
> pt_update);
> break;
> default:
> @@ -1301,7 +1336,8 @@ static int xe_pt_userptr_pre_commit(struct
> xe_migrate_pt_update *pt_update)
> down_read(&vm->userptr.notifier_lock);
>
> list_for_each_entry(op, &vops->list, link) {
> - err = op_check_userptr(vm, op, pt_update_ops);
> + err = op_check_userptr(&vm->xe->tiles[pt_update->tile_id],
> + vm, op, pt_update_ops);
> if (err) {
> up_read(&vm->userptr.notifier_lock);
> break;
> diff --git a/drivers/gpu/drm/xe/xe_pt_types.h
> b/drivers/gpu/drm/xe/xe_pt_types.h
> index 384cc04de719..0f9b7650cd6f 100644
> --- a/drivers/gpu/drm/xe/xe_pt_types.h
> +++ b/drivers/gpu/drm/xe/xe_pt_types.h
> @@ -29,7 +29,6 @@ struct xe_pt {
> struct xe_bo *bo;
> unsigned int level;
> unsigned int num_live;
> - bool rebind;
> bool is_compact;
> #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_VM)
> /** addr: Virtual address start address of the PT. */
> diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
> index ea2e287e6526..f90e5c92010c 100644
> --- a/drivers/gpu/drm/xe/xe_vm.c
> +++ b/drivers/gpu/drm/xe/xe_vm.c
> @@ -623,8 +623,6 @@ static bool vma_userptr_invalidate(struct
> mmu_interval_notifier *mni,
> spin_unlock(&vm->userptr.invalidated_lock);
> }
>
> - up_write(&vm->userptr.notifier_lock);
> -
> /*
> * Preempt fences turn into schedule disables, pipeline these.
> * Note that even in fault mode, we need to wait for binds and
> @@ -647,6 +645,8 @@ static bool vma_userptr_invalidate(struct
> mmu_interval_notifier *mni,
> XE_WARN_ON(err);
> }
>
> + up_write(&vm->userptr.notifier_lock);
> +
> trace_xe_vma_userptr_invalidate_complete(vma);
>
> return true;