[Intel-xe] [PATCH v4] drm/xe: Extend scratch page support to fault mode
Matthew Brost
matthew.brost at intel.com
Thu Nov 16 12:47:19 UTC 2023
On Tue, Nov 14, 2023 at 04:56:11PM -0800, Brian Welty wrote:
> The PVC HW has a limitation that the page fault due to invalid access
> will halt the corresponding EUs. So, in order to activate the debugger,
> kmd needs to set up the scratch pages to unhalt the EUs.
>
> This feature can only be enabled if scratch flag is set per VM during
> VM_CREATE. For use with EU debugger, the debugger umd will set the flag.
>
> The idea is to bind a scratch vma if the page fault is from an invalid
> access. This patch is taking advantage of null pte. After the bind, the
> user application can continue to run without causing a GPU hang or reset.
>
> In case the app will bind this scratch vma to a valid address, GPUVA
> handles all of this (e.g. it will create ops to unbind the old VMA and
> bind the new one).
> Future optimization: as we use null pte, it may not be necessary to
> invalidate TLBs when unbinding if all upper layer page tables are unmodified.
>
> v2: per Matt's suggestion, remove the scratch page unbind.
> v3: correct error handling.
> v4: add test for XE_VM_FLAG_64K, and ALIGN_DOWN the address
>
> Signed-off-by: Bruce Chang <yu.bruce.chang at intel.com>
> Reviewed-by: Brian Welty <brian.welty at intel.com>
> Acked-by: Stuart Summers <stuart.summers at intel.com>
> Signed-off-by: Brian Welty <brian.welty at intel.com>
Reviewed-by: Matthew Brost <matthew.brost at intel.com>
> ---
> drivers/gpu/drm/xe/xe_gt_pagefault.c | 9 ++++--
> drivers/gpu/drm/xe/xe_vm.c | 42 ++++++++++++++++++++++++----
> drivers/gpu/drm/xe/xe_vm.h | 2 ++
> 3 files changed, 46 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_gt_pagefault.c b/drivers/gpu/drm/xe/xe_gt_pagefault.c
> index 5b3585e2c125..91ad843713a6 100644
> --- a/drivers/gpu/drm/xe/xe_gt_pagefault.c
> +++ b/drivers/gpu/drm/xe/xe_gt_pagefault.c
> @@ -161,8 +161,13 @@ static int handle_pagefault(struct xe_gt *gt, struct pagefault *pf)
> write_locked = true;
> vma = lookup_vma(vm, pf->page_addr);
> if (!vma) {
> - ret = -EINVAL;
> - goto unlock_vm;
> + if (vm->flags & XE_VM_FLAG_SCRATCH_PAGE)
> + vma = xe_vm_create_scratch_vma(vm, pf->page_addr);
> +
> + if (IS_ERR_OR_NULL(vma)) {
> + ret = -EINVAL;
> + goto unlock_vm;
> + }
> }
>
> if (!xe_vma_is_userptr(vma) || !xe_vma_userptr_check_repin(vma)) {
> diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
> index d45f4f1d490f..191a1d2e9835 100644
> --- a/drivers/gpu/drm/xe/xe_vm.c
> +++ b/drivers/gpu/drm/xe/xe_vm.c
> @@ -1397,7 +1397,13 @@ struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags)
> }
> }
>
> - if (flags & XE_VM_FLAG_SCRATCH_PAGE) {
> + if (flags & XE_VM_FLAG_SCRATCH_PAGE &&
> + !(flags & XE_VM_FLAG_FAULT_MODE)) {
> + /*
> + * Create a global scratch page for mapping the whole VM's
> + * address space. However, in fault mode, we instead defer
> + * creation of scratch (null pte) to the fault handler.
> + */
> for_each_tile(tile, xe, id) {
> if (!vm->pt_root[id])
> continue;
> @@ -1951,10 +1957,6 @@ int xe_vm_create_ioctl(struct drm_device *dev, void *data,
> if (XE_IOCTL_DBG(xe, args->flags & ~ALL_DRM_XE_VM_CREATE_FLAGS))
> return -EINVAL;
>
> - if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_SCRATCH_PAGE &&
> - args->flags & DRM_XE_VM_CREATE_FAULT_MODE))
> - return -EINVAL;
> -
> if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_COMPUTE_MODE &&
> args->flags & DRM_XE_VM_CREATE_FAULT_MODE))
> return -EINVAL;
> @@ -3269,6 +3271,36 @@ int xe_vm_invalidate_vma(struct xe_vma *vma)
> return 0;
> }
>
> +struct xe_vma *xe_vm_create_scratch_vma(struct xe_vm *vm, u64 addr)
> +{
> + struct xe_vma *vma;
> + u64 end;
> + int err;
> +
> + if (xe_vm_is_closed_or_banned(vm))
> + return ERR_PTR(-ENOENT);
> +
> + if (vm->flags & XE_VM_FLAG_64K) {
> + addr = ALIGN_DOWN(addr, SZ_64K);
> + end = addr + SZ_64K - 1;
> + } else {
> + end = addr + PAGE_SIZE - 1;
> + }
> + vma = xe_vma_create(vm, NULL, 0, addr, end, false, true, 0);
> + if (IS_ERR_OR_NULL(vma))
> + return vma;
> +
> + err = xe_vm_insert_vma(vm, vma);
> + if (err) {
> + xe_vma_destroy_late(vma);
> + return ERR_PTR(err);
> + }
> +
> + /* fault will handle the bind */
> +
> + return vma;
> +}
> +
> int xe_analyze_vm(struct drm_printer *p, struct xe_vm *vm, int gt_id)
> {
> struct drm_gpuva *gpuva;
> diff --git a/drivers/gpu/drm/xe/xe_vm.h b/drivers/gpu/drm/xe/xe_vm.h
> index b08c75fbd8a1..e07ced37223d 100644
> --- a/drivers/gpu/drm/xe/xe_vm.h
> +++ b/drivers/gpu/drm/xe/xe_vm.h
> @@ -180,6 +180,8 @@ struct dma_fence *xe_vm_rebind(struct xe_vm *vm, bool rebind_worker);
>
> int xe_vm_invalidate_vma(struct xe_vma *vma);
>
> +struct xe_vma *xe_vm_create_scratch_vma(struct xe_vm *vm, u64 addr);
> +
> extern struct ttm_device_funcs xe_ttm_funcs;
>
> struct ttm_buffer_object *xe_vm_ttm_bo(struct xe_vm *vm);
> --
> 2.38.0
>
More information about the Intel-xe
mailing list