[PATCH v7 1/4] drm/xe/vf: Divide GGTT ballooning into allocation and insertion
Michal Wajdeczko
michal.wajdeczko at intel.com
Tue Apr 8 11:59:56 UTC 2025
On 03.04.2025 20:40, Tomasz Lis wrote:
> The balloon nodes, which are used to fill areas of GGTT inaccessible
> for a specific VF, were allocated and inserted into GGTT within one
> function. To be able to re-use that insertion code during VF
> migration recovery, we need to split it.
>
> This patch separates allocation (init/fini functions) from the insertion
> of balloons (balloon/deballoon functions). Locks are also moved to ensure
> calls from the post-migration recovery worker will not cause a deadlock.
>
> v2: Moved declarations to proper header
> v3: Rephrased description, introduced "_locked" versions of some
> functions, more lockdep checks, some functions renamed, altered error
> handling, added missing kerneldocs.
>
> Signed-off-by: Tomasz Lis <tomasz.lis at intel.com>
> ---
> drivers/gpu/drm/xe/xe_ggtt.c | 11 +--
> drivers/gpu/drm/xe/xe_gt_sriov_vf.c | 102 +++++++++++++++++++++-------
> drivers/gpu/drm/xe/xe_gt_sriov_vf.h | 2 +
> 3 files changed, 82 insertions(+), 33 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_ggtt.c b/drivers/gpu/drm/xe/xe_ggtt.c
> index 5fcb2b4c2c13..769a8dc9be6e 100644
> --- a/drivers/gpu/drm/xe/xe_ggtt.c
> +++ b/drivers/gpu/drm/xe/xe_ggtt.c
> @@ -447,14 +447,13 @@ int xe_ggtt_node_insert_balloon(struct xe_ggtt_node *node, u64 start, u64 end)
> xe_tile_assert(ggtt->tile, IS_ALIGNED(start, XE_PAGE_SIZE));
> xe_tile_assert(ggtt->tile, IS_ALIGNED(end, XE_PAGE_SIZE));
> xe_tile_assert(ggtt->tile, !drm_mm_node_allocated(&node->base));
> + lockdep_assert_held(&ggtt->lock);
since holding the lock is now a prerequisite, this function shall be
renamed to:
xe_ggtt_node_insert_balloon_locked()
likely with an update to the kerneldoc like:
"To be used in cases where ggtt->lock is already taken."
>
> node->base.color = 0;
> node->base.start = start;
> node->base.size = end - start;
>
> - mutex_lock(&ggtt->lock);
> err = drm_mm_reserve_node(&ggtt->mm, &node->base);
> - mutex_unlock(&ggtt->lock);
>
> if (xe_gt_WARN(ggtt->tile->primary_gt, err,
> "Failed to balloon GGTT %#llx-%#llx (%pe)\n",
> @@ -477,16 +476,12 @@ void xe_ggtt_node_remove_balloon(struct xe_ggtt_node *node)
same as in the earlier comment, rename this function to:
xe_ggtt_node_remove_balloon_locked()
> return;
>
> if (!drm_mm_node_allocated(&node->base))
> - goto free_node;
> + return;
>
> + lockdep_assert_held(&node->ggtt->lock);
this should be earlier in the function, as per the API contract the lock
must always be taken, not only when the node is allocated
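i.e. something like (untested sketch, assuming the NULL checks at the
top of the function stay as they are):

        void xe_ggtt_node_remove_balloon_locked(struct xe_ggtt_node *node)
        {
                if (!node || !node->ggtt)
                        return;

                /* the lock is required regardless of the node state */
                lockdep_assert_held(&node->ggtt->lock);

                if (!drm_mm_node_allocated(&node->base))
                        return;

                xe_ggtt_dump_node(node->ggtt, &node->base, "remove-balloon");

                drm_mm_remove_node(&node->base);
        }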
> xe_ggtt_dump_node(node->ggtt, &node->base, "remove-balloon");
>
> - mutex_lock(&node->ggtt->lock);
> drm_mm_remove_node(&node->base);
> - mutex_unlock(&node->ggtt->lock);
> -
> -free_node:
> - xe_ggtt_node_fini(node);
> }
>
> /**
> diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_vf.c b/drivers/gpu/drm/xe/xe_gt_sriov_vf.c
> index a439261bf4d7..c3ca33725161 100644
> --- a/drivers/gpu/drm/xe/xe_gt_sriov_vf.c
> +++ b/drivers/gpu/drm/xe/xe_gt_sriov_vf.c
> @@ -560,35 +560,38 @@ u64 xe_gt_sriov_vf_lmem(struct xe_gt *gt)
> return gt->sriov.vf.self_config.lmem_size;
> }
>
> -static struct xe_ggtt_node *
> -vf_balloon_ggtt_node(struct xe_ggtt *ggtt, u64 start, u64 end)
> +static int vf_init_ggtt_balloons(struct xe_gt *gt)
> {
> - struct xe_ggtt_node *node;
> - int err;
> + struct xe_tile *tile = gt_to_tile(gt);
> + struct xe_ggtt *ggtt = tile->mem.ggtt;
>
> - node = xe_ggtt_node_init(ggtt);
> - if (IS_ERR(node))
> - return node;
> + tile->sriov.vf.ggtt_balloon[0] = xe_ggtt_node_init(ggtt);
> + if (IS_ERR(tile->sriov.vf.ggtt_balloon[0]))
> + return PTR_ERR(tile->sriov.vf.ggtt_balloon[0]);
>
> - err = xe_ggtt_node_insert_balloon(node, start, end);
> - if (err) {
> - xe_ggtt_node_fini(node);
> - return ERR_PTR(err);
> - }
> + tile->sriov.vf.ggtt_balloon[1] = xe_ggtt_node_init(ggtt);
> + if (IS_ERR(tile->sriov.vf.ggtt_balloon[1]))
> + return PTR_ERR(tile->sriov.vf.ggtt_balloon[1]);
what about ggtt_balloon[0]? no need to fini() it on this error path?
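e.g. (untested):

        tile->sriov.vf.ggtt_balloon[1] = xe_ggtt_node_init(ggtt);
        if (IS_ERR(tile->sriov.vf.ggtt_balloon[1])) {
                int err = PTR_ERR(tile->sriov.vf.ggtt_balloon[1]);

                /* don't leak the node allocated just above */
                xe_ggtt_node_fini(tile->sriov.vf.ggtt_balloon[0]);
                tile->sriov.vf.ggtt_balloon[0] = NULL;
                return err;
        }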
>
> - return node;
> + return 0;
> }
>
> -static int vf_balloon_ggtt(struct xe_gt *gt)
> +/**
> + * xe_gt_sriov_vf_balloon_ggtt_locked - Insert balloon nodes to limit used GGTT address range.
> + * @gt: the &xe_gt struct instance
> + * Return: 0 on success or a negative error code on failure.
> + */
> +int xe_gt_sriov_vf_balloon_ggtt_locked(struct xe_gt *gt)
> {
> struct xe_gt_sriov_vf_selfconfig *config = &gt->sriov.vf.self_config;
> struct xe_tile *tile = gt_to_tile(gt);
> - struct xe_ggtt *ggtt = tile->mem.ggtt;
> struct xe_device *xe = gt_to_xe(gt);
> u64 start, end;
> + int err;
>
> xe_gt_assert(gt, IS_SRIOV_VF(xe));
> xe_gt_assert(gt, !xe_gt_is_media_type(gt));
> + lockdep_assert_held(&tile->mem.ggtt->lock);
>
> if (!config->ggtt_size)
> return -ENODATA;
> @@ -611,33 +614,76 @@ static int vf_balloon_ggtt(struct xe_gt *gt)
> start = xe_wopcm_size(xe);
> end = config->ggtt_base;
> if (end != start) {
> - tile->sriov.vf.ggtt_balloon[0] = vf_balloon_ggtt_node(ggtt, start, end);
> - if (IS_ERR(tile->sriov.vf.ggtt_balloon[0]))
> - return PTR_ERR(tile->sriov.vf.ggtt_balloon[0]);
> + err = xe_ggtt_node_insert_balloon(tile->sriov.vf.ggtt_balloon[0], start, end);
> + if (err)
> + return err;
> }
>
> start = config->ggtt_base + config->ggtt_size;
> end = GUC_GGTT_TOP;
> if (end != start) {
> - tile->sriov.vf.ggtt_balloon[1] = vf_balloon_ggtt_node(ggtt, start, end);
> - if (IS_ERR(tile->sriov.vf.ggtt_balloon[1])) {
> + err = xe_ggtt_node_insert_balloon(tile->sriov.vf.ggtt_balloon[1], start, end);
> + if (err) {
> xe_ggtt_node_remove_balloon(tile->sriov.vf.ggtt_balloon[0]);
> - return PTR_ERR(tile->sriov.vf.ggtt_balloon[1]);
> + return err;
> }
> }
>
> return 0;
> }
>
> -static void deballoon_ggtt(struct drm_device *drm, void *arg)
> +static int vf_balloon_ggtt(struct xe_gt *gt)
> {
> - struct xe_tile *tile = arg;
> + struct xe_ggtt *ggtt = gt_to_tile(gt)->mem.ggtt;
> + int err;
> +
> + mutex_lock(&ggtt->lock);
> + err = xe_gt_sriov_vf_balloon_ggtt_locked(gt);
> + mutex_unlock(&ggtt->lock);
> +
> + return err;
> +}
> +
> +/**
> + * xe_gt_sriov_vf_deballoon_ggtt_locked - Remove balloon nodes which limited the used address range.
> + * @gt: the &xe_gt struct instance
> + */
> +void xe_gt_sriov_vf_deballoon_ggtt_locked(struct xe_gt *gt)
> +{
> + struct xe_tile *tile = gt_to_tile(gt);
>
> xe_tile_assert(tile, IS_SRIOV_VF(tile_to_xe(tile)));
> + lockdep_assert_held(&tile->mem.ggtt->lock);
nit: IMO this is redundant, as the lock will already be asserted in
xe_ggtt_node_remove_balloon_locked()
> +
> xe_ggtt_node_remove_balloon(tile->sriov.vf.ggtt_balloon[1]);
> xe_ggtt_node_remove_balloon(tile->sriov.vf.ggtt_balloon[0]);
> }
>
> +static void vf_deballoon_ggtt(struct xe_gt *gt)
hmm, in this patch you don't really need to split the (de)balloon logic
into locked/unlocked parts, so maybe keep it as it was and introduce
such a split when it's really needed
also it's quite unusual that the unlocked part is named in a completely
different fashion than the locked one; can't they be (later) defined as
the pairs:
xe_gt_sriov_vf_deballoon_ggtt_locked()
xe_gt_sriov_vf_deballoon_ggtt()
and
xe_gt_sriov_vf_balloon_ggtt_locked()
xe_gt_sriov_vf_balloon_ggtt()
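with the unlocked variants being just thin wrappers, something like
(untested):

        void xe_gt_sriov_vf_deballoon_ggtt(struct xe_gt *gt)
        {
                struct xe_ggtt *ggtt = gt_to_tile(gt)->mem.ggtt;

                /* take the lock and delegate to the _locked variant */
                mutex_lock(&ggtt->lock);
                xe_gt_sriov_vf_deballoon_ggtt_locked(gt);
                mutex_unlock(&ggtt->lock);
        }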
> +{
> + struct xe_tile *tile = gt_to_tile(gt);
> +
> + mutex_lock(&tile->mem.ggtt->lock);
> + xe_gt_sriov_vf_deballoon_ggtt_locked(gt);
> + mutex_unlock(&tile->mem.ggtt->lock);
> +}
> +
> +static void vf_balloon_fini(struct xe_gt *gt)
> +{
> + struct xe_tile *tile = gt_to_tile(gt);
missing asserts:
xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));
xe_gt_assert(gt, !xe_gt_is_media_type(gt));
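for example (untested):

        static void vf_balloon_fini(struct xe_gt *gt)
        {
                struct xe_tile *tile = gt_to_tile(gt);

                /* balloons only exist for a SR-IOV VF on the primary GT */
                xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));
                xe_gt_assert(gt, !xe_gt_is_media_type(gt));

                xe_ggtt_node_fini(tile->sriov.vf.ggtt_balloon[1]);
                xe_ggtt_node_fini(tile->sriov.vf.ggtt_balloon[0]);
        }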
> +
> + xe_ggtt_node_fini(tile->sriov.vf.ggtt_balloon[1]);
> + xe_ggtt_node_fini(tile->sriov.vf.ggtt_balloon[0]);
> +}
> +
> +static void deballoon_and_fini_ggtt(struct drm_device *drm, void *arg)
> +{
> + struct xe_tile *tile = arg;
> +
> + vf_deballoon_ggtt(tile->primary_gt);
> + vf_balloon_fini(tile->primary_gt);
> +}
> +
> /**
> * xe_gt_sriov_vf_prepare_ggtt - Prepare a VF's GGTT configuration.
> * @gt: the &xe_gt
> @@ -655,11 +701,17 @@ int xe_gt_sriov_vf_prepare_ggtt(struct xe_gt *gt)
> if (xe_gt_is_media_type(gt))
> return 0;
>
> - err = vf_balloon_ggtt(gt);
> + err = vf_init_ggtt_balloons(gt);
> if (err)
> return err;
>
> - return drmm_add_action_or_reset(&xe->drm, deballoon_ggtt, tile);
> + err = vf_balloon_ggtt(gt);
> + if (err) {
> + vf_balloon_fini(gt);
> + return err;
> + }
> +
> + return drmm_add_action_or_reset(&xe->drm, deballoon_and_fini_ggtt, tile);
> }
>
> static int relay_action_handshake(struct xe_gt *gt, u32 *major, u32 *minor)
> diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_vf.h b/drivers/gpu/drm/xe/xe_gt_sriov_vf.h
> index ba6c5d74e326..d717deb8af91 100644
> --- a/drivers/gpu/drm/xe/xe_gt_sriov_vf.h
> +++ b/drivers/gpu/drm/xe/xe_gt_sriov_vf.h
> @@ -18,6 +18,8 @@ int xe_gt_sriov_vf_query_config(struct xe_gt *gt);
> int xe_gt_sriov_vf_connect(struct xe_gt *gt);
> int xe_gt_sriov_vf_query_runtime(struct xe_gt *gt);
> int xe_gt_sriov_vf_prepare_ggtt(struct xe_gt *gt);
> +int xe_gt_sriov_vf_balloon_ggtt_locked(struct xe_gt *gt);
> +void xe_gt_sriov_vf_deballoon_ggtt_locked(struct xe_gt *gt);
> int xe_gt_sriov_vf_notify_resfix_done(struct xe_gt *gt);
> void xe_gt_sriov_vf_migrated_event_handler(struct xe_gt *gt);
>