[PATCH v2 2/5] drm/xe: Strict migration policy for atomic SVM faults

Ghimiray, Himal Prasad himal.prasad.ghimiray at intel.com
Mon Apr 21 06:39:15 UTC 2025



On 17-04-2025 09:43, Matthew Brost wrote:
> Mixing GPU and CPU atomics does not work unless a strict migration
> policy is enforced: memory targeted by GPU atomics must be in device
> memory. Enforce a must-be-in-VRAM policy with a retry loop of 2 attempts;
> if the retry loop fails, abort the fault.
> 
> v2:
>   - Only retry migration on atomics
>   - Drop always_migrate_to_vram modparam
> 
> Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
> Signed-off-by: Matthew Brost <matthew.brost at intel.com>
> ---
>   drivers/gpu/drm/xe/xe_module.c |  3 --
>   drivers/gpu/drm/xe/xe_module.h |  1 -
>   drivers/gpu/drm/xe/xe_svm.c    | 57 ++++++++++++++++++++++++++--------
>   drivers/gpu/drm/xe/xe_svm.h    |  5 ---
>   4 files changed, 44 insertions(+), 22 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_module.c b/drivers/gpu/drm/xe/xe_module.c
> index 05c7d0ae6d83..1c4dfafbcd0b 100644
> --- a/drivers/gpu/drm/xe/xe_module.c
> +++ b/drivers/gpu/drm/xe/xe_module.c
> @@ -33,9 +33,6 @@ struct xe_modparam xe_modparam = {
>   module_param_named(svm_notifier_size, xe_modparam.svm_notifier_size, uint, 0600);
>   MODULE_PARM_DESC(svm_notifier_size, "Set the svm notifier size(in MiB), must be power of 2");
>   
> -module_param_named(always_migrate_to_vram, xe_modparam.always_migrate_to_vram, bool, 0444);
> -MODULE_PARM_DESC(always_migrate_to_vram, "Always migrate to VRAM on GPU fault");
> -
>   module_param_named_unsafe(force_execlist, xe_modparam.force_execlist, bool, 0444);
>   MODULE_PARM_DESC(force_execlist, "Force Execlist submission");
>   
> diff --git a/drivers/gpu/drm/xe/xe_module.h b/drivers/gpu/drm/xe/xe_module.h
> index 84339e509c80..5a3bfea8b7b4 100644
> --- a/drivers/gpu/drm/xe/xe_module.h
> +++ b/drivers/gpu/drm/xe/xe_module.h
> @@ -12,7 +12,6 @@
>   struct xe_modparam {
>   	bool force_execlist;
>   	bool probe_display;
> -	bool always_migrate_to_vram;
>   	u32 force_vram_bar_size;
>   	int guc_log_level;
>   	char *guc_firmware_path;
> diff --git a/drivers/gpu/drm/xe/xe_svm.c b/drivers/gpu/drm/xe/xe_svm.c
> index 56b18a293bbc..1cc41ce7b684 100644
> --- a/drivers/gpu/drm/xe/xe_svm.c
> +++ b/drivers/gpu/drm/xe/xe_svm.c
> @@ -726,6 +726,35 @@ static int xe_svm_alloc_vram(struct xe_vm *vm, struct xe_tile *tile,
>   }
>   #endif
>   
> +static bool supports_4K_migration(struct xe_device *xe)
> +{
> +	if (xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K)
> +		return false;
> +
> +	return true;
> +}
> +
> +static bool xe_svm_range_needs_migrate_to_vram(struct xe_svm_range *range,
> +					       struct xe_vma *vma)
> +{
> +	struct xe_vm *vm = range_to_vm(&range->base);
> +	u64 range_size = xe_svm_range_size(range);
> +
> +	if (!range->base.flags.migrate_devmem)
> +		return false;
> +
> +	if (xe_svm_range_in_vram(range)) {
> +		drm_dbg(&vm->xe->drm, "Range is already in VRAM\n");
> +		return false;
> +	}
> +
> +	if (range_size <= SZ_64K && !supports_4K_migration(vm->xe)) {
> +		drm_dbg(&vm->xe->drm, "Platform doesn't support SZ_4K range migration\n");
> +		return false;
> +	}
> +
> +	return true;
> +}
>   
>   /**
>    * xe_svm_handle_pagefault() - SVM handle page fault
> @@ -750,12 +779,14 @@ int xe_svm_handle_pagefault(struct xe_vm *vm, struct xe_vma *vma,
>   			IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR),
>   		.check_pages_threshold = IS_DGFX(vm->xe) &&
>   			IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR) ? SZ_64K : 0,
> +		.vram_only = atomic,

I think this should be `atomic && IS_DGFX(vm->xe)`, matching the IS_DGFX() checks on the fields above.
>   	};
>   	struct xe_svm_range *range;
>   	struct drm_gpusvm_range *r;
>   	struct drm_exec exec;
>   	struct dma_fence *fence;
>   	struct xe_tile *tile = gt_to_tile(gt);
> +	int migrate_try_count = atomic ? 3 : 1;
>   	ktime_t end = 0;
>   	int err;
>   
> @@ -782,18 +813,21 @@ int xe_svm_handle_pagefault(struct xe_vm *vm, struct xe_vma *vma,
>   
>   	range_debug(range, "PAGE FAULT");
>   
> -	/* XXX: Add migration policy, for now migrate range once */
> -	if (!range->skip_migrate && range->base.flags.migrate_devmem &&
> -	    xe_svm_range_size(range) >= SZ_64K) {
> -		range->skip_migrate = true;
> -
> +	if (--migrate_try_count >= 0 &&
> +	    xe_svm_range_needs_migrate_to_vram(range, vma)) {
>   		err = xe_svm_alloc_vram(vm, tile, range, &ctx);
>   		if (err) {
> -			drm_dbg(&vm->xe->drm,
> -				"VRAM allocation failed, falling back to "
> -				"retrying fault, asid=%u, errno=%pe\n",
> -				vm->usm.asid, ERR_PTR(err));
> -			goto retry;
> +			if (migrate_try_count || !ctx.vram_only) {
> +				drm_dbg(&vm->xe->drm,
> +					"VRAM allocation failed, falling back to retrying fault, asid=%u, errno=%pe\n",
> +					vm->usm.asid, ERR_PTR(err));
> +				goto retry;
> +			} else {
> +				drm_err(&vm->xe->drm,
> +					"VRAM allocation failed, retry count exceeded, asid=%u, errno=%pe\n",
> +					vm->usm.asid, ERR_PTR(err));
> +				return err;
> +			}
>   		}
>   	}
>   
> @@ -843,9 +877,6 @@ int xe_svm_handle_pagefault(struct xe_vm *vm, struct xe_vma *vma,
>   	}
>   	drm_exec_fini(&exec);
>   
> -	if (xe_modparam.always_migrate_to_vram)
> -		range->skip_migrate = false;
> -
>   	dma_fence_wait(fence, false);
>   	dma_fence_put(fence);
>   
> diff --git a/drivers/gpu/drm/xe/xe_svm.h b/drivers/gpu/drm/xe/xe_svm.h
> index 3d441eb1f7ea..0e1f376a7471 100644
> --- a/drivers/gpu/drm/xe/xe_svm.h
> +++ b/drivers/gpu/drm/xe/xe_svm.h
> @@ -39,11 +39,6 @@ struct xe_svm_range {
>   	 * range. Protected by GPU SVM notifier lock.
>   	 */
>   	u8 tile_invalidated;
> -	/**
> -	 * @skip_migrate: Skip migration to VRAM, protected by GPU fault handler
> -	 * locking.
> -	 */
> -	u8 skip_migrate	:1;
>   };
>   
>   /**



More information about the Intel-xe mailing list