[PATCH v8 1/4] drm/xe/vf: Divide GGTT ballooning into allocation and insertion
Michal Wajdeczko
michal.wajdeczko at intel.com
Thu Apr 10 16:54:27 UTC 2025
On 09.04.2025 23:13, Tomasz Lis wrote:
> The balloon nodes, which are used to fill areas of GGTT inaccessible
> for a specific VF, were allocated and inserted into GGTT within one
> function. To be able to re-use that insertion code during VF
> migration recovery, we need to split it.
>
> This patch separates allocation (init/fini functs) from the insertion
> of balloons (balloon/deballoon functs). Locks are also moved to ensure
> calls from post-migration recovery worker will not cause a deadlock.
>
> v2: Moved declarations to proper header
> v3: Rephrased description, introduced "_locked" versions of some
> functs, more lockdep checks, some functions renamed, altered error
> handling, added missing kerneldocs.
> v4: Suffixed more functs with `_locked`, moved lockdep asserts,
> fixed finalization in error path, added asserts
>
> Signed-off-by: Tomasz Lis <tomasz.lis at intel.com>
> Cc: Michal Wajdeczko <michal.wajdeczko at intel.com>
> ---
> drivers/gpu/drm/xe/xe_ggtt.c | 34 ++++-----
> drivers/gpu/drm/xe/xe_ggtt.h | 6 +-
> drivers/gpu/drm/xe/xe_gt_sriov_vf.c | 109 +++++++++++++++++++++-------
> drivers/gpu/drm/xe/xe_gt_sriov_vf.h | 2 +
> 4 files changed, 103 insertions(+), 48 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_ggtt.c b/drivers/gpu/drm/xe/xe_ggtt.c
> index 7062115909f2..e7d474bd0cfe 100644
> --- a/drivers/gpu/drm/xe/xe_ggtt.c
> +++ b/drivers/gpu/drm/xe/xe_ggtt.c
> @@ -1,4 +1,4 @@
> -// SPDX-License-Identifier: MIT
> +/// SPDX-License-Identifier: MIT
what's this?
> /*
> * Copyright © 2021 Intel Corporation
> */
> @@ -429,16 +429,17 @@ static void xe_ggtt_dump_node(struct xe_ggtt *ggtt,
> }
>
> /**
> - * xe_ggtt_node_insert_balloon - prevent allocation of specified GGTT addresses
> + * xe_ggtt_node_insert_balloon_locked - prevent allocation of specified GGTT addresses
> * @node: the &xe_ggtt_node to hold reserved GGTT node
> * @start: the starting GGTT address of the reserved region
> * @end: then end GGTT address of the reserved region
> *
> - * Use xe_ggtt_node_remove_balloon() to release a reserved GGTT node.
> + * To be used in cases where ggtt->lock is already taken.
> + * Use xe_ggtt_node_remove_balloon_locked() to release a reserved GGTT node.
> *
> * Return: 0 on success or a negative error code on failure.
> */
> -int xe_ggtt_node_insert_balloon(struct xe_ggtt_node *node, u64 start, u64 end)
> +int xe_ggtt_node_insert_balloon_locked(struct xe_ggtt_node *node, u64 start, u64 end)
> {
> struct xe_ggtt *ggtt = node->ggtt;
> int err;
> @@ -447,14 +448,13 @@ int xe_ggtt_node_insert_balloon(struct xe_ggtt_node *node, u64 start, u64 end)
> xe_tile_assert(ggtt->tile, IS_ALIGNED(start, XE_PAGE_SIZE));
> xe_tile_assert(ggtt->tile, IS_ALIGNED(end, XE_PAGE_SIZE));
> xe_tile_assert(ggtt->tile, !drm_mm_node_allocated(&node->base));
> + lockdep_assert_held(&ggtt->lock);
>
> node->base.color = 0;
> node->base.start = start;
> node->base.size = end - start;
>
> - mutex_lock(&ggtt->lock);
> err = drm_mm_reserve_node(&ggtt->mm, &node->base);
> - mutex_unlock(&ggtt->lock);
>
> if (xe_gt_WARN(ggtt->tile->primary_gt, err,
> "Failed to balloon GGTT %#llx-%#llx (%pe)\n",
> @@ -466,27 +466,25 @@ int xe_ggtt_node_insert_balloon(struct xe_ggtt_node *node, u64 start, u64 end)
> }
>
> /**
> - * xe_ggtt_node_remove_balloon - release a reserved GGTT region
> + * xe_ggtt_node_remove_balloon_locked - release a reserved GGTT region
> * @node: the &xe_ggtt_node with reserved GGTT region
> *
> - * See xe_ggtt_node_insert_balloon() for details.
> + * To be used in cases where ggtt->lock is already taken.
> + * See xe_ggtt_node_insert_balloon_locked() for details.
> */
> -void xe_ggtt_node_remove_balloon(struct xe_ggtt_node *node)
> +void xe_ggtt_node_remove_balloon_locked(struct xe_ggtt_node *node)
> {
> + lockdep_assert_held(&node->ggtt->lock);
> +
> if (!node || !node->ggtt)
> return;
>
> if (!drm_mm_node_allocated(&node->base))
nit: we should use xe_ggtt_node_allocated() instead
> - goto free_node;
> + return;
>
> xe_ggtt_dump_node(node->ggtt, &node->base, "remove-balloon");
>
> - mutex_lock(&node->ggtt->lock);
> drm_mm_remove_node(&node->base);
> - mutex_unlock(&node->ggtt->lock);
> -
> -free_node:
> - xe_ggtt_node_fini(node);
> }
>
> /**
> @@ -539,10 +537,10 @@ int xe_ggtt_node_insert(struct xe_ggtt_node *node, u32 size, u32 align)
> *
> * This function will allocated the struct %xe_ggtt_node and return it's pointer.
since you're around, can you fix above s/allocated/allocate
> * This struct will then be freed after the node removal upon xe_ggtt_node_remove()
> - * or xe_ggtt_node_remove_balloon().
> + * or xe_ggtt_node_remove_balloon_locked().
> * Having %xe_ggtt_node struct allocated doesn't mean that the node is already allocated
> * in GGTT. Only the xe_ggtt_node_insert(), xe_ggtt_node_insert_locked(),
> - * xe_ggtt_node_insert_balloon() will ensure the node is inserted or reserved in GGTT.
> + * xe_ggtt_node_insert_balloon_locked() will ensure the node is inserted or reserved in GGTT.
> *
> * Return: A pointer to %xe_ggtt_node struct on success. An ERR_PTR otherwise.
> **/
> @@ -564,7 +562,7 @@ struct xe_ggtt_node *xe_ggtt_node_init(struct xe_ggtt *ggtt)
> * @node: the &xe_ggtt_node to be freed
> *
> * If anything went wrong with either xe_ggtt_node_insert(), xe_ggtt_node_insert_locked(),
> - * or xe_ggtt_node_insert_balloon(); and this @node is not going to be reused, then,
> + * or xe_ggtt_node_insert_balloon_locked(); and this @node is not going to be reused, then,
> * this function needs to be called to free the %xe_ggtt_node struct
> **/
> void xe_ggtt_node_fini(struct xe_ggtt_node *node)
> diff --git a/drivers/gpu/drm/xe/xe_ggtt.h b/drivers/gpu/drm/xe/xe_ggtt.h
> index 27e7d67de004..d468af96b465 100644
> --- a/drivers/gpu/drm/xe/xe_ggtt.h
> +++ b/drivers/gpu/drm/xe/xe_ggtt.h
> @@ -15,9 +15,9 @@ int xe_ggtt_init(struct xe_ggtt *ggtt);
>
> struct xe_ggtt_node *xe_ggtt_node_init(struct xe_ggtt *ggtt);
> void xe_ggtt_node_fini(struct xe_ggtt_node *node);
> -int xe_ggtt_node_insert_balloon(struct xe_ggtt_node *node,
> - u64 start, u64 size);
> -void xe_ggtt_node_remove_balloon(struct xe_ggtt_node *node);
> +int xe_ggtt_node_insert_balloon_locked(struct xe_ggtt_node *node,
> + u64 start, u64 size);
> +void xe_ggtt_node_remove_balloon_locked(struct xe_ggtt_node *node);
>
> int xe_ggtt_node_insert(struct xe_ggtt_node *node, u32 size, u32 align);
> int xe_ggtt_node_insert_locked(struct xe_ggtt_node *node,
> diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_vf.c b/drivers/gpu/drm/xe/xe_gt_sriov_vf.c
> index a439261bf4d7..d2f6bfb3aacf 100644
> --- a/drivers/gpu/drm/xe/xe_gt_sriov_vf.c
> +++ b/drivers/gpu/drm/xe/xe_gt_sriov_vf.c
> @@ -560,35 +560,40 @@ u64 xe_gt_sriov_vf_lmem(struct xe_gt *gt)
> return gt->sriov.vf.self_config.lmem_size;
> }
>
> -static struct xe_ggtt_node *
> -vf_balloon_ggtt_node(struct xe_ggtt *ggtt, u64 start, u64 end)
> +static int vf_init_ggtt_balloons(struct xe_gt *gt)
> {
> - struct xe_ggtt_node *node;
> - int err;
> + struct xe_tile *tile = gt_to_tile(gt);
> + struct xe_ggtt *ggtt = tile->mem.ggtt;
>
> - node = xe_ggtt_node_init(ggtt);
> - if (IS_ERR(node))
> - return node;
> + tile->sriov.vf.ggtt_balloon[0] = xe_ggtt_node_init(ggtt);
> + if (IS_ERR(tile->sriov.vf.ggtt_balloon[0]))
> + return PTR_ERR(tile->sriov.vf.ggtt_balloon[0]);
as mentioned earlier in similar case, since you are accessing vf
specific fields we should have asserts in this function:
xe_gt_assert(gt, IS_SRIOV_VF(xe));
xe_gt_assert(gt, !xe_gt_is_media_type(gt));
>
> - err = xe_ggtt_node_insert_balloon(node, start, end);
> - if (err) {
> - xe_ggtt_node_fini(node);
> - return ERR_PTR(err);
> + tile->sriov.vf.ggtt_balloon[1] = xe_ggtt_node_init(ggtt);
> + if (IS_ERR(tile->sriov.vf.ggtt_balloon[1])) {
> + xe_ggtt_node_fini(tile->sriov.vf.ggtt_balloon[0]);
> + return PTR_ERR(tile->sriov.vf.ggtt_balloon[1]);
> }
>
> - return node;
> + return 0;
> }
>
> -static int vf_balloon_ggtt(struct xe_gt *gt)
> +/**
> + * xe_gt_sriov_vf_balloon_ggtt_locked - Insert balloon nodes to limit used GGTT address range.
> + * @gt: the &xe_gt struct instance
> + * Return: 0 on success or a negative error code on failure.
> + */
> +int xe_gt_sriov_vf_balloon_ggtt_locked(struct xe_gt *gt)
> {
> struct xe_gt_sriov_vf_selfconfig *config = >->sriov.vf.self_config;
> struct xe_tile *tile = gt_to_tile(gt);
> - struct xe_ggtt *ggtt = tile->mem.ggtt;
> struct xe_device *xe = gt_to_xe(gt);
> u64 start, end;
> + int err;
>
> xe_gt_assert(gt, IS_SRIOV_VF(xe));
> xe_gt_assert(gt, !xe_gt_is_media_type(gt));
> + lockdep_assert_held(&tile->mem.ggtt->lock);
>
> if (!config->ggtt_size)
> return -ENODATA;
> @@ -611,31 +616,75 @@ static int vf_balloon_ggtt(struct xe_gt *gt)
> start = xe_wopcm_size(xe);
> end = config->ggtt_base;
> if (end != start) {
> - tile->sriov.vf.ggtt_balloon[0] = vf_balloon_ggtt_node(ggtt, start, end);
> - if (IS_ERR(tile->sriov.vf.ggtt_balloon[0]))
> - return PTR_ERR(tile->sriov.vf.ggtt_balloon[0]);
> + err = xe_ggtt_node_insert_balloon_locked(tile->sriov.vf.ggtt_balloon[0], start, end);
> + if (err)
> + return err;
> }
>
> start = config->ggtt_base + config->ggtt_size;
> end = GUC_GGTT_TOP;
> if (end != start) {
> - tile->sriov.vf.ggtt_balloon[1] = vf_balloon_ggtt_node(ggtt, start, end);
> - if (IS_ERR(tile->sriov.vf.ggtt_balloon[1])) {
> - xe_ggtt_node_remove_balloon(tile->sriov.vf.ggtt_balloon[0]);
> - return PTR_ERR(tile->sriov.vf.ggtt_balloon[1]);
> + err = xe_ggtt_node_insert_balloon_locked(tile->sriov.vf.ggtt_balloon[1], start, end);
> + if (err) {
> + xe_ggtt_node_remove_balloon_locked(tile->sriov.vf.ggtt_balloon[0]);
> + return err;
> }
> }
>
> return 0;
> }
>
> -static void deballoon_ggtt(struct drm_device *drm, void *arg)
> +static int vf_balloon_ggtt(struct xe_gt *gt)
> {
> - struct xe_tile *tile = arg;
> + struct xe_ggtt *ggtt = gt_to_tile(gt)->mem.ggtt;
> + int err;
> +
> + mutex_lock(&ggtt->lock);
> + err = xe_gt_sriov_vf_balloon_ggtt_locked(gt);
> + mutex_unlock(&ggtt->lock);
> +
> + return err;
> +}
> +
> +/**
> + * xe_gt_sriov_vf_deballoon_ggtt_locked - Remove balloon nodes which limited used address renge.
typo
hmm, but I'm not sure we need "which ..." part anyway
> + * @gt: the &xe_gt struct instance
> + */
> +void xe_gt_sriov_vf_deballoon_ggtt_locked(struct xe_gt *gt)
> +{
> + struct xe_tile *tile = gt_to_tile(gt);
>
> xe_tile_assert(tile, IS_SRIOV_VF(tile_to_xe(tile)));
> - xe_ggtt_node_remove_balloon(tile->sriov.vf.ggtt_balloon[1]);
> - xe_ggtt_node_remove_balloon(tile->sriov.vf.ggtt_balloon[0]);
> + xe_ggtt_node_remove_balloon_locked(tile->sriov.vf.ggtt_balloon[1]);
> + xe_ggtt_node_remove_balloon_locked(tile->sriov.vf.ggtt_balloon[0]);
> +}
> +
> +static void vf_deballoon_ggtt(struct xe_gt *gt)
> +{
> + struct xe_tile *tile = gt_to_tile(gt);
> +
> + mutex_lock(&tile->mem.ggtt->lock);
> + xe_gt_sriov_vf_deballoon_ggtt_locked(gt);
> + mutex_unlock(&tile->mem.ggtt->lock);
> +}
> +
> +static void vf_balloon_fini(struct xe_gt *gt)
naming... since this is a cleanup of the vf_init_ggtt_balloons() then it
should be named in matching fashion, so
s/vf_balloon_fini/vf_fini_ggtt_balloons
> +{
> + struct xe_tile *tile = gt_to_tile(gt);
> +
> + xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));
> + xe_gt_assert(gt, !xe_gt_is_media_type(gt));
> +
> + xe_ggtt_node_fini(tile->sriov.vf.ggtt_balloon[1]);
> + xe_ggtt_node_fini(tile->sriov.vf.ggtt_balloon[0]);
> +}
> +
> +static void deballoon_and_fini_ggtt(struct drm_device *drm, void *arg)
hmm, naming again...
as this is a cleanup action for the xe_gt_sriov_vf_prepare_ggtt() then maybe
s/deballoon_and_fini_ggtt/cleanup_ggtt
will suffice?
> +{
> + struct xe_tile *tile = arg;
> +
> + vf_deballoon_ggtt(tile->primary_gt);
> + vf_balloon_fini(tile->primary_gt);
> }
>
> /**
> @@ -655,11 +704,17 @@ int xe_gt_sriov_vf_prepare_ggtt(struct xe_gt *gt)
> if (xe_gt_is_media_type(gt))
> return 0;
>
> - err = vf_balloon_ggtt(gt);
> + err = vf_init_ggtt_balloons(gt);
> if (err)
> return err;
>
> - return drmm_add_action_or_reset(&xe->drm, deballoon_ggtt, tile);
> + err = vf_balloon_ggtt(gt);
> + if (err) {
> + vf_balloon_fini(gt);
> + return err;
> + }
> +
> + return drmm_add_action_or_reset(&xe->drm, deballoon_and_fini_ggtt, tile);
> }
>
> static int relay_action_handshake(struct xe_gt *gt, u32 *major, u32 *minor)
> diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_vf.h b/drivers/gpu/drm/xe/xe_gt_sriov_vf.h
> index ba6c5d74e326..d717deb8af91 100644
> --- a/drivers/gpu/drm/xe/xe_gt_sriov_vf.h
> +++ b/drivers/gpu/drm/xe/xe_gt_sriov_vf.h
> @@ -18,6 +18,8 @@ int xe_gt_sriov_vf_query_config(struct xe_gt *gt);
> int xe_gt_sriov_vf_connect(struct xe_gt *gt);
> int xe_gt_sriov_vf_query_runtime(struct xe_gt *gt);
> int xe_gt_sriov_vf_prepare_ggtt(struct xe_gt *gt);
> +int xe_gt_sriov_vf_balloon_ggtt_locked(struct xe_gt *gt);
> +void xe_gt_sriov_vf_deballoon_ggtt_locked(struct xe_gt *gt);
> int xe_gt_sriov_vf_notify_resfix_done(struct xe_gt *gt);
> void xe_gt_sriov_vf_migrated_event_handler(struct xe_gt *gt);
>
More information about the Intel-xe
mailing list