[PATCH v8 1/4] drm/xe/vf: Divide GGTT ballooning into allocation and insertion

Thu Apr 10 16:54:27 UTC 2025

On 09.04.2025 23:13, Tomasz Lis wrote:
> The balloon nodes, which are used to fill areas of GGTT inaccessible
> for a specific VF, were allocated and inserted into GGTT within one
> function. To be able to re-use that insertion code during VF
> migration recovery, we need to split it.
> 
> This patch separates allocation (init/fini functs) from the insertion
> of balloons (balloon/deballoon functs). Locks are also moved to ensure
> calls from post-migration recovery worker will not cause a deadlock.
> 
> v2: Moved declarations to proper header
> v3: Rephrased description, introduced "_locked" versions of some
>   functs, more lockdep checks, some functions renamed, altered error
>   handling, added missing kerneldocs.
> v4: Suffixed more functs with `_locked`, moved lockdep asserts,
>   fixed finalization in error path, added asserts
> 
> Signed-off-by: Tomasz Lis <tomasz.lis at intel.com>
> Cc: Michal Wajdeczko <michal.wajdeczko at intel.com>
> ---
>  drivers/gpu/drm/xe/xe_ggtt.c        |  34 ++++-----
>  drivers/gpu/drm/xe/xe_ggtt.h        |   6 +-
>  drivers/gpu/drm/xe/xe_gt_sriov_vf.c | 109 +++++++++++++++++++++-------
>  drivers/gpu/drm/xe/xe_gt_sriov_vf.h |   2 +
>  4 files changed, 103 insertions(+), 48 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_ggtt.c b/drivers/gpu/drm/xe/xe_ggtt.c
> index 7062115909f2..e7d474bd0cfe 100644
> --- a/drivers/gpu/drm/xe/xe_ggtt.c
> +++ b/drivers/gpu/drm/xe/xe_ggtt.c
> @@ -1,4 +1,4 @@
> -// SPDX-License-Identifier: MIT
> +/// SPDX-License-Identifier: MIT

what's this?

>  /*
>   * Copyright © 2021 Intel Corporation
>   */
> @@ -429,16 +429,17 @@ static void xe_ggtt_dump_node(struct xe_ggtt *ggtt,
>  }
>  
>  /**
> - * xe_ggtt_node_insert_balloon - prevent allocation of specified GGTT addresses
> + * xe_ggtt_node_insert_balloon_locked - prevent allocation of specified GGTT addresses
>   * @node: the &xe_ggtt_node to hold reserved GGTT node
>   * @start: the starting GGTT address of the reserved region
>   * @end: then end GGTT address of the reserved region
>   *
> - * Use xe_ggtt_node_remove_balloon() to release a reserved GGTT node.
> + * To be used in cases where ggtt->lock is already taken.
> + * Use xe_ggtt_node_remove_balloon_locked() to release a reserved GGTT node.
>   *
>   * Return: 0 on success or a negative error code on failure.
>   */
> -int xe_ggtt_node_insert_balloon(struct xe_ggtt_node *node, u64 start, u64 end)
> +int xe_ggtt_node_insert_balloon_locked(struct xe_ggtt_node *node, u64 start, u64 end)
>  {
>  	struct xe_ggtt *ggtt = node->ggtt;
>  	int err;
> @@ -447,14 +448,13 @@ int xe_ggtt_node_insert_balloon(struct xe_ggtt_node *node, u64 start, u64 end)
>  	xe_tile_assert(ggtt->tile, IS_ALIGNED(start, XE_PAGE_SIZE));
>  	xe_tile_assert(ggtt->tile, IS_ALIGNED(end, XE_PAGE_SIZE));
>  	xe_tile_assert(ggtt->tile, !drm_mm_node_allocated(&node->base));
> +	lockdep_assert_held(&ggtt->lock);
>  
>  	node->base.color = 0;
>  	node->base.start = start;
>  	node->base.size = end - start;
>  
> -	mutex_lock(&ggtt->lock);
>  	err = drm_mm_reserve_node(&ggtt->mm, &node->base);
> -	mutex_unlock(&ggtt->lock);
>  
>  	if (xe_gt_WARN(ggtt->tile->primary_gt, err,
>  		       "Failed to balloon GGTT %#llx-%#llx (%pe)\n",
> @@ -466,27 +466,25 @@ int xe_ggtt_node_insert_balloon(struct xe_ggtt_node *node, u64 start, u64 end)
>  }
>  
>  /**
> - * xe_ggtt_node_remove_balloon - release a reserved GGTT region
> + * xe_ggtt_node_remove_balloon_locked - release a reserved GGTT region
>   * @node: the &xe_ggtt_node with reserved GGTT region
>   *
> - * See xe_ggtt_node_insert_balloon() for details.
> + * To be used in cases where ggtt->lock is already taken.
> + * See xe_ggtt_node_insert_balloon_locked() for details.
>   */
> -void xe_ggtt_node_remove_balloon(struct xe_ggtt_node *node)
> +void xe_ggtt_node_remove_balloon_locked(struct xe_ggtt_node *node)
>  {
> +	lockdep_assert_held(&node->ggtt->lock);
> +
>  	if (!node || !node->ggtt)
>  		return;
>  
>  	if (!drm_mm_node_allocated(&node->base))

nit: we should use xe_ggtt_node_allocated() instead

> -		goto free_node;
> +		return;
>  
>  	xe_ggtt_dump_node(node->ggtt, &node->base, "remove-balloon");
>  
> -	mutex_lock(&node->ggtt->lock);
>  	drm_mm_remove_node(&node->base);
> -	mutex_unlock(&node->ggtt->lock);
> -
> -free_node:
> -	xe_ggtt_node_fini(node);
>  }
>  
>  /**
> @@ -539,10 +537,10 @@ int xe_ggtt_node_insert(struct xe_ggtt_node *node, u32 size, u32 align)
>   *
>   * This function will allocated the struct %xe_ggtt_node and return it's pointer.

since you're around, can you fix above s/allocated/allocate

>   * This struct will then be freed after the node removal upon xe_ggtt_node_remove()
> - * or xe_ggtt_node_remove_balloon().
> + * or xe_ggtt_node_remove_balloon_locked().
>   * Having %xe_ggtt_node struct allocated doesn't mean that the node is already allocated
>   * in GGTT. Only the xe_ggtt_node_insert(), xe_ggtt_node_insert_locked(),
> - * xe_ggtt_node_insert_balloon() will ensure the node is inserted or reserved in GGTT.
> + * xe_ggtt_node_insert_balloon_locked() will ensure the node is inserted or reserved in GGTT.
>   *
>   * Return: A pointer to %xe_ggtt_node struct on success. An ERR_PTR otherwise.
>   **/
> @@ -564,7 +562,7 @@ struct xe_ggtt_node *xe_ggtt_node_init(struct xe_ggtt *ggtt)
>   * @node: the &xe_ggtt_node to be freed
>   *
>   * If anything went wrong with either xe_ggtt_node_insert(), xe_ggtt_node_insert_locked(),
> - * or xe_ggtt_node_insert_balloon(); and this @node is not going to be reused, then,
> + * or xe_ggtt_node_insert_balloon_locked(); and this @node is not going to be reused, then,
>   * this function needs to be called to free the %xe_ggtt_node struct
>   **/
>  void xe_ggtt_node_fini(struct xe_ggtt_node *node)
> diff --git a/drivers/gpu/drm/xe/xe_ggtt.h b/drivers/gpu/drm/xe/xe_ggtt.h
> index 27e7d67de004..d468af96b465 100644
> --- a/drivers/gpu/drm/xe/xe_ggtt.h
> +++ b/drivers/gpu/drm/xe/xe_ggtt.h
> @@ -15,9 +15,9 @@ int xe_ggtt_init(struct xe_ggtt *ggtt);
>  
>  struct xe_ggtt_node *xe_ggtt_node_init(struct xe_ggtt *ggtt);
>  void xe_ggtt_node_fini(struct xe_ggtt_node *node);
> -int xe_ggtt_node_insert_balloon(struct xe_ggtt_node *node,
> -				u64 start, u64 size);
> -void xe_ggtt_node_remove_balloon(struct xe_ggtt_node *node);
> +int xe_ggtt_node_insert_balloon_locked(struct xe_ggtt_node *node,
> +				       u64 start, u64 size);
> +void xe_ggtt_node_remove_balloon_locked(struct xe_ggtt_node *node);
>  
>  int xe_ggtt_node_insert(struct xe_ggtt_node *node, u32 size, u32 align);
>  int xe_ggtt_node_insert_locked(struct xe_ggtt_node *node,
> diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_vf.c b/drivers/gpu/drm/xe/xe_gt_sriov_vf.c
> index a439261bf4d7..d2f6bfb3aacf 100644
> --- a/drivers/gpu/drm/xe/xe_gt_sriov_vf.c
> +++ b/drivers/gpu/drm/xe/xe_gt_sriov_vf.c
> @@ -560,35 +560,40 @@ u64 xe_gt_sriov_vf_lmem(struct xe_gt *gt)
>  	return gt->sriov.vf.self_config.lmem_size;
>  }
>  
> -static struct xe_ggtt_node *
> -vf_balloon_ggtt_node(struct xe_ggtt *ggtt, u64 start, u64 end)
> +static int vf_init_ggtt_balloons(struct xe_gt *gt)
>  {
> -	struct xe_ggtt_node *node;
> -	int err;
> +	struct xe_tile *tile = gt_to_tile(gt);
> +	struct xe_ggtt *ggtt = tile->mem.ggtt;
>  
> -	node = xe_ggtt_node_init(ggtt);
> -	if (IS_ERR(node))
> -		return node;
> +	tile->sriov.vf.ggtt_balloon[0] = xe_ggtt_node_init(ggtt);
> +	if (IS_ERR(tile->sriov.vf.ggtt_balloon[0]))
> +		return PTR_ERR(tile->sriov.vf.ggtt_balloon[0]);

as mentioned earlier in similar case, since you are accessing vf
specific fields we should have asserts in this function:

  	xe_gt_assert(gt, IS_SRIOV_VF(xe));
  	xe_gt_assert(gt, !xe_gt_is_media_type(gt));

>  
> -	err = xe_ggtt_node_insert_balloon(node, start, end);
> -	if (err) {
> -		xe_ggtt_node_fini(node);
> -		return ERR_PTR(err);
> +	tile->sriov.vf.ggtt_balloon[1] = xe_ggtt_node_init(ggtt);
> +	if (IS_ERR(tile->sriov.vf.ggtt_balloon[1])) {
> +		xe_ggtt_node_fini(tile->sriov.vf.ggtt_balloon[0]);
> +		return PTR_ERR(tile->sriov.vf.ggtt_balloon[1]);
>  	}
>  
> -	return node;
> +	return 0;
>  }
>  
> -static int vf_balloon_ggtt(struct xe_gt *gt)
> +/**
> + * xe_gt_sriov_vf_balloon_ggtt_locked - Insert balloon nodes to limit used GGTT address range.
> + * @gt: the &xe_gt struct instance
> + * Return: 0 on success or a negative error code on failure.
> + */
> +int xe_gt_sriov_vf_balloon_ggtt_locked(struct xe_gt *gt)
>  {
>  	struct xe_gt_sriov_vf_selfconfig *config = &gt->sriov.vf.self_config;
>  	struct xe_tile *tile = gt_to_tile(gt);
> -	struct xe_ggtt *ggtt = tile->mem.ggtt;
>  	struct xe_device *xe = gt_to_xe(gt);
>  	u64 start, end;
> +	int err;
>  
>  	xe_gt_assert(gt, IS_SRIOV_VF(xe));
>  	xe_gt_assert(gt, !xe_gt_is_media_type(gt));
> +	lockdep_assert_held(&tile->mem.ggtt->lock);
>  
>  	if (!config->ggtt_size)
>  		return -ENODATA;
> @@ -611,31 +616,75 @@ static int vf_balloon_ggtt(struct xe_gt *gt)
>  	start = xe_wopcm_size(xe);
>  	end = config->ggtt_base;
>  	if (end != start) {
> -		tile->sriov.vf.ggtt_balloon[0] = vf_balloon_ggtt_node(ggtt, start, end);
> -		if (IS_ERR(tile->sriov.vf.ggtt_balloon[0]))
> -			return PTR_ERR(tile->sriov.vf.ggtt_balloon[0]);
> +		err = xe_ggtt_node_insert_balloon_locked(tile->sriov.vf.ggtt_balloon[0], start, end);
> +		if (err)
> +			return err;
>  	}
>  
>  	start = config->ggtt_base + config->ggtt_size;
>  	end = GUC_GGTT_TOP;
>  	if (end != start) {
> -		tile->sriov.vf.ggtt_balloon[1] = vf_balloon_ggtt_node(ggtt, start, end);
> -		if (IS_ERR(tile->sriov.vf.ggtt_balloon[1])) {
> -			xe_ggtt_node_remove_balloon(tile->sriov.vf.ggtt_balloon[0]);
> -			return PTR_ERR(tile->sriov.vf.ggtt_balloon[1]);
> +		err = xe_ggtt_node_insert_balloon_locked(tile->sriov.vf.ggtt_balloon[1], start, end);
> +		if (err) {
> +			xe_ggtt_node_remove_balloon_locked(tile->sriov.vf.ggtt_balloon[0]);
> +			return err;
>  		}
>  	}
>  
>  	return 0;
>  }
>  
> -static void deballoon_ggtt(struct drm_device *drm, void *arg)
> +static int vf_balloon_ggtt(struct xe_gt *gt)
>  {
> -	struct xe_tile *tile = arg;
> +	struct xe_ggtt *ggtt = gt_to_tile(gt)->mem.ggtt;
> +	int err;
> +
> +	mutex_lock(&ggtt->lock);
> +	err = xe_gt_sriov_vf_balloon_ggtt_locked(gt);
> +	mutex_unlock(&ggtt->lock);
> +
> +	return err;
> +}
> +
> +/**
> + * xe_gt_sriov_vf_deballoon_ggtt_locked - Remove balloon nodes which limited used address renge.

typo

hmm, but I'm not sure we need "which ..." part anyway

> + * @gt: the &xe_gt struct instance
> + */
> +void xe_gt_sriov_vf_deballoon_ggtt_locked(struct xe_gt *gt)
> +{
> +	struct xe_tile *tile = gt_to_tile(gt);
>  
>  	xe_tile_assert(tile, IS_SRIOV_VF(tile_to_xe(tile)));
> -	xe_ggtt_node_remove_balloon(tile->sriov.vf.ggtt_balloon[1]);
> -	xe_ggtt_node_remove_balloon(tile->sriov.vf.ggtt_balloon[0]);
> +	xe_ggtt_node_remove_balloon_locked(tile->sriov.vf.ggtt_balloon[1]);
> +	xe_ggtt_node_remove_balloon_locked(tile->sriov.vf.ggtt_balloon[0]);
> +}
> +
> +static void vf_deballoon_ggtt(struct xe_gt *gt)
> +{
> +	struct xe_tile *tile = gt_to_tile(gt);
> +
> +	mutex_lock(&tile->mem.ggtt->lock);
> +	xe_gt_sriov_vf_deballoon_ggtt_locked(gt);
> +	mutex_unlock(&tile->mem.ggtt->lock);
> +}
> +
> +static void vf_balloon_fini(struct xe_gt *gt)

naming... since this is a cleanup of the vf_init_ggtt_balloons() then it
should be named in matching fashion, so

s/vf_balloon_fini/vf_fini_ggtt_balloons

> +{
> +	struct xe_tile *tile = gt_to_tile(gt);
> +
> +	xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));
> +	xe_gt_assert(gt, !xe_gt_is_media_type(gt));
> +
> +	xe_ggtt_node_fini(tile->sriov.vf.ggtt_balloon[1]);
> +	xe_ggtt_node_fini(tile->sriov.vf.ggtt_balloon[0]);
> +}
> +
> +static void deballoon_and_fini_ggtt(struct drm_device *drm, void *arg)

hmm, naming again...

as this is a cleanup action for the xe_gt_sriov_vf_prepare_ggtt() then maybe
	s/deballoon_and_fini_ggtt/cleanup_ggtt
will suffice?

> +{
> +	struct xe_tile *tile = arg;
> +
> +	vf_deballoon_ggtt(tile->primary_gt);
> +	vf_balloon_fini(tile->primary_gt);
>  }
>  
>  /**
> @@ -655,11 +704,17 @@ int xe_gt_sriov_vf_prepare_ggtt(struct xe_gt *gt)
>  	if (xe_gt_is_media_type(gt))
>  		return 0;
>  
> -	err = vf_balloon_ggtt(gt);
> +	err = vf_init_ggtt_balloons(gt);
>  	if (err)
>  		return err;
>  
> -	return drmm_add_action_or_reset(&xe->drm, deballoon_ggtt, tile);
> +	err = vf_balloon_ggtt(gt);
> +	if (err) {
> +		vf_balloon_fini(gt);
> +		return err;
> +	}
> +
> +	return drmm_add_action_or_reset(&xe->drm, deballoon_and_fini_ggtt, tile);
>  }
>  
>  static int relay_action_handshake(struct xe_gt *gt, u32 *major, u32 *minor)
> diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_vf.h b/drivers/gpu/drm/xe/xe_gt_sriov_vf.h
> index ba6c5d74e326..d717deb8af91 100644
> --- a/drivers/gpu/drm/xe/xe_gt_sriov_vf.h
> +++ b/drivers/gpu/drm/xe/xe_gt_sriov_vf.h
> @@ -18,6 +18,8 @@ int xe_gt_sriov_vf_query_config(struct xe_gt *gt);
>  int xe_gt_sriov_vf_connect(struct xe_gt *gt);
>  int xe_gt_sriov_vf_query_runtime(struct xe_gt *gt);
>  int xe_gt_sriov_vf_prepare_ggtt(struct xe_gt *gt);
> +int xe_gt_sriov_vf_balloon_ggtt_locked(struct xe_gt *gt);
> +void xe_gt_sriov_vf_deballoon_ggtt_locked(struct xe_gt *gt);
>  int xe_gt_sriov_vf_notify_resfix_done(struct xe_gt *gt);
>  void xe_gt_sriov_vf_migrated_event_handler(struct xe_gt *gt);
>