[PATCH v2 3/4] drm/xe/pf: Add functions to save and restore VF LMEM and CCS data

Matthew Brost matthew.brost at intel.com
Thu Oct 31 15:49:47 UTC 2024


On Thu, Oct 31, 2024 at 04:17:24PM +0100, Lukasz Laguna wrote:
> State of VF LMEM and corresponding CCS metadata needs to be saved and
> restored during migration of VM with attached dGPU VF. Add helpers that
> allow it. We will start using them in upcoming patches.
> 
> Signed-off-by: Lukasz Laguna <lukasz.laguna at intel.com>
> ---
>  drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.c | 112 ++++++++++++++++++
>  drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.h |   9 ++
>  2 files changed, 121 insertions(+)
> 
> diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.c b/drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.c
> index c712111aa30d..eca01c96a348 100644
> --- a/drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.c
> +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.c
> @@ -12,6 +12,7 @@
>  #include "xe_gt_sriov_printk.h"
>  #include "xe_guc.h"
>  #include "xe_guc_ct.h"
> +#include "xe_migrate.h"
>  #include "xe_sriov.h"
>  
>  /* Return: number of dwords saved/restored/required or a negative error code on failure */
> @@ -382,6 +383,117 @@ ssize_t xe_gt_sriov_pf_migration_write_guc_state(struct xe_gt *gt, unsigned int
>  }
>  #endif /* CONFIG_DEBUG_FS */
>  
> +static struct dma_fence *pf_save_restore_lmem(struct xe_gt *gt, unsigned int vfid,
> +					      struct xe_bo *smem, u64 smem_offset,
> +					      struct xe_bo *ccs, u64 ccs_offset,
> +					      u64 lmem_offset, size_t size, bool save)
> +{
> +	struct xe_bo *lmem;
> +	struct dma_fence *ret = NULL;
> +
> +	xe_gt_assert(gt, IS_SRIOV_PF(gt_to_xe(gt)));
> +	xe_gt_assert(gt, vfid != PFID);
> +	xe_gt_assert(gt, vfid <= xe_sriov_pf_get_totalvfs(gt_to_xe(gt)));
> +	xe_gt_assert(gt, smem || ccs);
> +

For the locking here, you are going to need to explain and justify why
you are open coding it with nesting / trylocking rather than, say, using
a drm_exec loop to lock all the BOs (sketch below).
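
Something along these lines is what I have in mind -- an untested sketch
(needs #include <drm/drm_exec.h>), assuming lmem has already been looked
up safely and that &bo->ttm.base is the right way to reach the
underlying GEM object here:

	struct drm_exec exec;
	int err = 0;

	drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0);
	drm_exec_until_all_locked(&exec) {
		/* lock the VF LMEM BO plus whichever sysmem BOs exist */
		err = drm_exec_lock_obj(&exec, &lmem->ttm.base);
		drm_exec_retry_on_contention(&exec);
		if (err)
			break;

		if (smem) {
			err = drm_exec_lock_obj(&exec, &smem->ttm.base);
			drm_exec_retry_on_contention(&exec);
			if (err)
				break;
		}

		if (ccs) {
			err = drm_exec_lock_obj(&exec, &ccs->ttm.base);
			drm_exec_retry_on_contention(&exec);
			if (err)
				break;
		}
	}

	if (err)
		ret = ERR_PTR(err);
	else
		ret = xe_migrate_raw_vram_copy(lmem, lmem_offset,
					       smem, smem_offset,
					       ccs, ccs_offset,
					       size, save);

	drm_exec_fini(&exec);

drm_exec handles the deadlock avoidance / backoff for you, so the
trylock special cases and the unwind labels all go away.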

> +	mutex_lock(xe_gt_sriov_pf_master_mutex(gt));
> +
> +	lmem = gt->sriov.pf.vfs[vfid].config.lmem_obj;
> +	if (!lmem) {
> +		mutex_unlock(xe_gt_sriov_pf_master_mutex(gt));
> +		return ERR_PTR(-ENXIO);
> +	}
> +
> +	if (!xe_bo_trylock(lmem)) {
> +		mutex_unlock(xe_gt_sriov_pf_master_mutex(gt));
> +		return ERR_PTR(-EBUSY);
> +	}

Gut reaction - this looks bad.

Here I'm assuming you use a trylock because xe_gt_sriov_pf_master_mutex
is in the path of reclaim and thus cannot be nested with a dma-resv
lock, which is reclaim safe...

e.g. the locking order is:

fs_reclaim_acquire()
mutex_lock(xe_gt_sriov_pf_master_mutex())
mutex_unlock(xe_gt_sriov_pf_master_mutex())
fs_reclaim_release()

vs.
dma_resv_lock()
fs_reclaim_acquire()
fs_reclaim_release()
dma_resv_unlock()

Why do you need 'xe_bo_trylock(lmem)' under
'xe_gt_sriov_pf_master_mutex' but not 'xe_bo_trylock(smem)' or
'xe_bo_trylock(ccs)'?

All of this kinda screams to me that the SRIOV locking might need a bit
of rework to either...

- Completely remove fs_reclaim_acquire from the outside of
  xe_gt_sriov_pf_master_mutex

- Or split that into 2 with a reclaim-safe outer lock and an
  in-path-of-reclaim inner lock (see the sketch below).
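
e.g. with the split, the lmem lookup plus lock could then be a plain
sleeping lock -- purely illustrative, xe_gt_sriov_pf_outer_mutex is a
made-up name:

	/* outer lock never taken in reclaim, so dma-resv can nest under it */
	mutex_lock(xe_gt_sriov_pf_outer_mutex(gt));
	lmem = gt->sriov.pf.vfs[vfid].config.lmem_obj;
	err = lmem ? xe_bo_lock(lmem, true) : -ENXIO;
	mutex_unlock(xe_gt_sriov_pf_outer_mutex(gt));

No -EBUSY bail-outs needed, and the ordering vs. fs_reclaim stays sane.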

That being said, I don't have a great understanding of the SRIOV
locking, so I may be way off here. At a minimum I think it would be good
to review the SRIOV locking design with a few SRIOV people, Thomas, and
myself, because on its surface this looks a bit scary when things like
this need to be done.

Matt

> +
> +	mutex_unlock(xe_gt_sriov_pf_master_mutex(gt));
> +
> +	if (smem) {
> +		if (!xe_bo_trylock(smem)) {
> +			ret = ERR_PTR(-EBUSY);
> +			goto err_lmem_unlock;
> +		}
> +	}
> +	if (ccs) {
> +		if (!xe_bo_trylock(ccs)) {
> +			ret = ERR_PTR(-EBUSY);
> +			goto err_smem_unlock;
> +		}
> +	}
> +
> +	ret = xe_migrate_raw_vram_copy(lmem, lmem_offset, smem, smem_offset, ccs, ccs_offset,
> +				       size, save);
> +
> +	if (ccs)
> +		xe_bo_unlock(ccs);
> +err_smem_unlock:
> +	if (smem)
> +		xe_bo_unlock(smem);
> +err_lmem_unlock:
> +	xe_bo_unlock(lmem);
> +
> +	return ret;
> +}
> +
> +/**
> + * xe_gt_sriov_pf_migration_save_lmem() - Save a VF LMEM and corresponding CCS.
> + * @gt: the &xe_gt
> + * @vfid: the VF identifier
> + * @smem: the sysmem BO to save LMEM data to (NULL if not needed)
> + * @smem_offset: the offset of @smem BO
> + * @ccs: the sysmem BO to save CCS data to (NULL if not needed)
> + * @ccs_offset: the offset of @ccs BO
> + * @lmem_offset: the offset of VF LMEM chunk
> + * @size: the size of VF LMEM chunk
> + *
> + * This function is for PF only.
> + *
> + * This function saves VF LMEM and corresponding CCS data into sysmem buffer
> + * objects.
> + *
> + * Return: Pointer to a dma_fence representing the last copy batch, or
> + * an error pointer on failure.
> + */
> +struct dma_fence *xe_gt_sriov_pf_migration_save_lmem(struct xe_gt *gt, unsigned int vfid,
> +						     struct xe_bo *smem, u64 smem_offset,
> +						     struct xe_bo *ccs, u64 ccs_offset,
> +						     u64 lmem_offset, size_t size)
> +{
> +	return pf_save_restore_lmem(gt, vfid, smem, smem_offset, ccs, ccs_offset,
> +				    lmem_offset, size, true);
> +}
> +
> +/**
> + * xe_gt_sriov_pf_migration_restore_lmem() - Restore a VF LMEM and corresponding CCS.
> + * @gt: the &xe_gt
> + * @vfid: the VF identifier
> + * @smem: the sysmem BO to restore LMEM data from (NULL if not needed)
> + * @smem_offset: the offset of @smem BO
> + * @ccs: the sysmem BO to restore CCS data from (NULL if not needed)
> + * @ccs_offset: the offset of @ccs BO
> + * @lmem_offset: the offset of VF LMEM chunk
> + * @size: the size of VF LMEM chunk
> + *
> + * This function is for PF only.
> + *
> + * This function restores VF LMEM and corresponding CCS data from sysmem
> + * buffer objects.
> + *
> + * Return: Pointer to a dma_fence representing the last copy batch, or
> + * an error pointer on failure.
> + */
> +struct dma_fence *xe_gt_sriov_pf_migration_restore_lmem(struct xe_gt *gt, unsigned int vfid,
> +							struct xe_bo *smem, u64 smem_offset,
> +							struct xe_bo *ccs, u64 ccs_offset,
> +							u64 lmem_offset, size_t size)
> +{
> +	return pf_save_restore_lmem(gt, vfid, smem, smem_offset, ccs, ccs_offset,
> +				    lmem_offset, size, false);
> +}
> +
>  static bool pf_check_migration_support(struct xe_gt *gt)
>  {
>  	/* GuC 70.25 with save/restore v2 is required */
> diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.h b/drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.h
> index 09faeae00ddb..a4301574d92c 100644
> --- a/drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.h
> +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.h
> @@ -8,11 +8,20 @@
>  
>  #include <linux/types.h>
>  
> +struct xe_bo;
>  struct xe_gt;
>  
>  int xe_gt_sriov_pf_migration_init(struct xe_gt *gt);
>  int xe_gt_sriov_pf_migration_save_guc_state(struct xe_gt *gt, unsigned int vfid);
>  int xe_gt_sriov_pf_migration_restore_guc_state(struct xe_gt *gt, unsigned int vfid);
> +struct dma_fence *xe_gt_sriov_pf_migration_save_lmem(struct xe_gt *gt, unsigned int vfid,
> +						     struct xe_bo *smem, u64 smem_offset,
> +						     struct xe_bo *ccs, u64 ccs_offset,
> +						     u64 lmem_offset, size_t size);
> +struct dma_fence *xe_gt_sriov_pf_migration_restore_lmem(struct xe_gt *gt, unsigned int vfid,
> +							struct xe_bo *smem, u64 smem_offset,
> +							struct xe_bo *ccs, u64 ccs_offset,
> +							u64 lmem_offset, size_t size);
>  
>  #ifdef CONFIG_DEBUG_FS
>  ssize_t xe_gt_sriov_pf_migration_read_guc_state(struct xe_gt *gt, unsigned int vfid,
> -- 
> 2.40.0
> 

