[PATCH] drm/xe/vf: Fix VM crash during VF driver release
Michal Wajdeczko
michal.wajdeczko at intel.com
Sun Jul 27 17:33:53 UTC 2025
On 7/26/2025 3:39 AM, Satyanarayana K V P wrote:
> The VF CCS save/restore series (patchwork #149108) has a dependency
> on the migration framework. A recent migration update in commit
> d65ff1ec8535 ("drm/xe: Split xe_migrate allocation from initialization")
> caused a VM crash during XE driver release for iGPU devices.
You may want to include the crash log here for reference.
>
> Update the VF CCS migration initialization sequence to align with the new
> migration framework changes, resolving the release-time crash.
>
> Fixes: f3009272ff2e ("drm/xe/vf: Create contexts for CCS read write")
>
No empty line here: the Fixes: tag should directly precede the Signed-off-by: tag.
> Signed-off-by: Satyanarayana K V P <satyanarayana.k.v.p at intel.com>
> Cc: Michal Wajdeczko <michal.wajdeczko at intel.com>
> Cc: Matthew Brost <matthew.brost at intel.com>
> Cc: Matthew Auld <matthew.auld at intel.com>
> Cc: Piotr Piórkowski <piotr.piorkowski at intel.com>
> ---
> drivers/gpu/drm/xe/xe_gt.c | 2 +-
> drivers/gpu/drm/xe/xe_migrate.c | 5 ++---
> drivers/gpu/drm/xe/xe_migrate.h | 2 +-
> drivers/gpu/drm/xe/xe_sriov_vf_ccs.c | 9 ++++++++-
> 4 files changed, 12 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
> index c8eda36546d3..1ef6eea7df19 100644
> --- a/drivers/gpu/drm/xe/xe_gt.c
> +++ b/drivers/gpu/drm/xe/xe_gt.c
> @@ -564,7 +564,7 @@ static int gt_init_with_all_forcewake(struct xe_gt *gt)
> if (xe_gt_is_main_type(gt)) {
> struct xe_tile *tile = gt_to_tile(gt);
>
> - tile->migrate = xe_migrate_init(tile);
> + tile->migrate = xe_migrate_init(tile, tile->migrate);
> if (IS_ERR(tile->migrate)) {
> err = PTR_ERR(tile->migrate);
> goto err_force_wake;
> diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c
> index 90065d7d29ff..ae03440a276c 100644
> --- a/drivers/gpu/drm/xe/xe_migrate.c
> +++ b/drivers/gpu/drm/xe/xe_migrate.c
> @@ -400,11 +400,10 @@ struct xe_migrate *xe_migrate_alloc(struct xe_tile *tile)
> *
> * Return: Pointer to a migrate context on success. Error pointer on error.
> */
> -struct xe_migrate *xe_migrate_init(struct xe_tile *tile)
> +struct xe_migrate *xe_migrate_init(struct xe_tile *tile, struct xe_migrate *m)
Hmm, why are we passing 'tile' here? It should be just 'm', since 'tile' can be
accessed from m->tile, which was initialized in xe_migrate_alloc().
Then we can simply return an error code instead of playing with ERR_PTR.
> {
> - struct xe_device *xe = tile_to_xe(tile);
> struct xe_gt *primary_gt = tile->primary_gt;
> - struct xe_migrate *m = tile->migrate;
> + struct xe_device *xe = tile_to_xe(tile);
> struct xe_vm *vm;
> int err;
>
> diff --git a/drivers/gpu/drm/xe/xe_migrate.h b/drivers/gpu/drm/xe/xe_migrate.h
> index 3758f9615484..a5dfb34fc072 100644
> --- a/drivers/gpu/drm/xe/xe_migrate.h
> +++ b/drivers/gpu/drm/xe/xe_migrate.h
> @@ -105,7 +105,7 @@ struct xe_migrate_pt_update {
> };
>
> struct xe_migrate *xe_migrate_alloc(struct xe_tile *tile);
> -struct xe_migrate *xe_migrate_init(struct xe_tile *tile);
> +struct xe_migrate *xe_migrate_init(struct xe_tile *tile, struct xe_migrate *m);
>
> struct dma_fence *xe_migrate_to_vram(struct xe_migrate *m,
> unsigned long npages,
> diff --git a/drivers/gpu/drm/xe/xe_sriov_vf_ccs.c b/drivers/gpu/drm/xe/xe_sriov_vf_ccs.c
> index af43e04179aa..1752864f7ffe 100644
> --- a/drivers/gpu/drm/xe/xe_sriov_vf_ccs.c
> +++ b/drivers/gpu/drm/xe/xe_sriov_vf_ccs.c
> @@ -270,11 +270,18 @@ int xe_sriov_vf_ccs_init(struct xe_device *xe)
> ctx = &tile->sriov.vf.ccs[ctx_id];
> ctx->ctx_id = ctx_id;
>
> - migrate = xe_migrate_init(tile);
> + migrate = xe_migrate_alloc(tile);
> if (IS_ERR(migrate)) {
> err = PTR_ERR(migrate);
> goto err_ret;
> }
> +
> + migrate = xe_migrate_init(tile, migrate);
> + if (IS_ERR(migrate)) {
> + err = PTR_ERR(migrate);
> + goto err_ret;
> + }
> +
> ctx->migrate = migrate;
>
> err = alloc_bb_pool(tile, ctx);
More information about the Intel-xe mailing list