[PATCH 3/3] drm/xe: Add XE_BO_FLAG_PINNED_NEED_LOAD

Matthew Brost matthew.brost at intel.com
Tue Oct 29 01:07:18 UTC 2024


On Mon, Oct 28, 2024 at 05:32:24PM -0700, Matthew Brost wrote:
> Not all pinned BOs need a memcpy to be restored, rather just ones which are
> required for resume to complete. Add XE_BO_FLAG_PINNED_NEED_LOAD which
> indicates a BO needs to be restored via a memcpy prior to loading GuC. This
> should speed up resume / d3cold exit slightly as the GPU can be used to
> copy some of the pinned BOs.
> 
> Marking most kernel BOs and migration LRC with
> XE_BO_FLAG_PINNED_NEED_LOAD to be safe. This could be trimmed down in
> future.
> 

This patch seems buggy, so I am going to drop it for now as most of the perf
improvements are from the WONTNEED patch. Maybe we can revive it later.

Matt

> Signed-off-by: Matthew Brost <matthew.brost at intel.com>
> ---
>  drivers/gpu/drm/xe/xe_bo.c                 |  10 +-
>  drivers/gpu/drm/xe/xe_bo.h                 |   1 +
>  drivers/gpu/drm/xe/xe_bo_evict.c           | 117 ++++++++++++++++-----
>  drivers/gpu/drm/xe/xe_ggtt.c               |   2 +-
>  drivers/gpu/drm/xe/xe_gsc.c                |   1 +
>  drivers/gpu/drm/xe/xe_gsc_proxy.c          |   1 +
>  drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c |   1 +
>  drivers/gpu/drm/xe/xe_huc.c                |   1 +
>  drivers/gpu/drm/xe/xe_hw_engine.c          |   1 +
>  drivers/gpu/drm/xe/xe_lmtt.c               |   1 +
>  drivers/gpu/drm/xe/xe_lrc.c                |   3 +
>  drivers/gpu/drm/xe/xe_memirq.c             |   1 +
>  drivers/gpu/drm/xe/xe_migrate.c            |   1 +
>  drivers/gpu/drm/xe/xe_uc_fw.c              |   1 +
>  14 files changed, 108 insertions(+), 34 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c
> index 6c8fd5ced2a2..4c48e7ef1e1f 100644
> --- a/drivers/gpu/drm/xe/xe_bo.c
> +++ b/drivers/gpu/drm/xe/xe_bo.c
> @@ -773,15 +773,12 @@ static int xe_bo_move(struct ttm_buffer_object *ttm_bo, bool evict,
>  
>  	if (bo->flags & XE_BO_FLAG_PINNED_WONTNEED) {
>  		ttm_bo_move_null(&bo->ttm, new_mem);
> -	} else if (xe_bo_is_pinned(bo) && !xe_bo_is_user(bo)) {
> +	} else if (xe_bo_is_pinned(bo) && bo->flags & XE_BO_FLAG_PINNED_NEED_LOAD) {
>  		/*
>  		 * Kernel memory that is pinned should only be moved on suspend
>  		 * / resume, some of the pinned memory is required for the
>  		 * device to resume / use the GPU to move other evicted memory
> -		 * (user memory) around. This likely could be optimized a bit
> -		 * futher where we find the minimum set of pinned memory
> -		 * required for resume but for simplity doing a memcpy for all
> -		 * pinned memory.
> +		 * (user memory, pinned kernel not required for load) around.
>  		 */
>  		ret = xe_bo_vmap(bo);
>  		if (!ret) {
> @@ -1706,7 +1703,8 @@ int xe_managed_bo_reinit_in_vram(struct xe_device *xe, struct xe_tile *tile, str
>  	u32 dst_flags = XE_BO_FLAG_VRAM_IF_DGFX(tile) | XE_BO_FLAG_GGTT;
>  
>  	dst_flags |= (*src)->flags & (XE_BO_FLAG_GGTT_INVALIDATE |
> -				      XE_BO_FLAG_PINNED_WONTNEED);
> +				      XE_BO_FLAG_PINNED_WONTNEED |
> +				      XE_BO_FLAG_PINNED_NEED_LOAD);
>  
>  	xe_assert(xe, IS_DGFX(xe));
>  	xe_assert(xe, !(*src)->vmap.is_iomem);
> diff --git a/drivers/gpu/drm/xe/xe_bo.h b/drivers/gpu/drm/xe/xe_bo.h
> index 854ab8624d7a..cbda820c9d79 100644
> --- a/drivers/gpu/drm/xe/xe_bo.h
> +++ b/drivers/gpu/drm/xe/xe_bo.h
> @@ -40,6 +40,7 @@
>  #define XE_BO_FLAG_NEEDS_2M		BIT(16)
>  #define XE_BO_FLAG_GGTT_INVALIDATE	BIT(17)
>  #define XE_BO_FLAG_PINNED_WONTNEED	BIT(18)
> +#define XE_BO_FLAG_PINNED_NEED_LOAD	BIT(19)
>  /* this one is trigger internally only */
>  #define XE_BO_FLAG_INTERNAL_TEST	BIT(30)
>  #define XE_BO_FLAG_INTERNAL_64K		BIT(31)
> diff --git a/drivers/gpu/drm/xe/xe_bo_evict.c b/drivers/gpu/drm/xe/xe_bo_evict.c
> index 32043e1e5a86..049868610e0b 100644
> --- a/drivers/gpu/drm/xe/xe_bo_evict.c
> +++ b/drivers/gpu/drm/xe/xe_bo_evict.c
> @@ -78,6 +78,37 @@ int xe_bo_evict_all(struct xe_device *xe)
>  	list_splice_tail(&still_in_list, &xe->pinned.external_vram);
>  	spin_unlock(&xe->pinned.lock);
>  
> +	/* Kernel memory with GPU copy */
> +	INIT_LIST_HEAD(&still_in_list);
> +	spin_lock(&xe->pinned.lock);
> +	for (;;) {
> +		bo = list_first_entry_or_null(&xe->pinned.kernel_bo_present,
> +					      typeof(*bo), pinned_link);
> +		if (!bo)
> +			break;
> +
> +		if (bo->flags & (XE_BO_FLAG_PINNED_NEED_LOAD |
> +				 XE_BO_FLAG_PINNED_WONTNEED)) {
> +			list_move_tail(&bo->pinned_link, &still_in_list);
> +			continue;
> +		}
> +
> +		xe_bo_get(bo);
> +		list_move_tail(&bo->pinned_link, &xe->pinned.evicted);
> +		spin_unlock(&xe->pinned.lock);
> +
> +		xe_bo_lock(bo, false);
> +		ret = xe_bo_evict_pinned(bo);
> +		xe_bo_unlock(bo);
> +		xe_bo_put(bo);
> +		if (ret)
> +			return ret;
> +
> +		spin_lock(&xe->pinned.lock);
> +	}
> +	list_splice_tail(&still_in_list, &xe->pinned.kernel_bo_present);
> +	spin_unlock(&xe->pinned.lock);
> +
>  	/*
>  	 * Wait for all user BO to be evicted as those evictions depend on the
>  	 * memory moved below.
> @@ -109,6 +140,43 @@ int xe_bo_evict_all(struct xe_device *xe)
>  	return 0;
>  }
>  
> +static int do_restore_kernel(struct xe_device *xe, struct xe_bo *bo)
> +{
> +	int ret;
> +
> +	xe_bo_get(bo);
> +	list_move_tail(&bo->pinned_link, &xe->pinned.kernel_bo_present);
> +	spin_unlock(&xe->pinned.lock);
> +
> +	xe_bo_lock(bo, false);
> +	ret = xe_bo_restore_pinned(bo);
> +	xe_bo_unlock(bo);
> +	if (ret) {
> +		xe_bo_put(bo);
> +		return ret;
> +	}
> +
> +	if (bo->flags & XE_BO_FLAG_GGTT) {
> +		struct xe_tile *tile = bo->tile;
> +
> +		mutex_lock(&tile->mem.ggtt->lock);
> +		xe_ggtt_map_bo(tile->mem.ggtt, bo);
> +		mutex_unlock(&tile->mem.ggtt->lock);
> +	}
> +
> +	/*
> +	 * We expect validate to trigger a move VRAM and our move code
> +	 * should setup the iosys map.
> +	 */
> +	xe_assert(xe, !iosys_map_is_null(&bo->vmap));
> +
> +	xe_bo_put(bo);
> +
> +	spin_lock(&xe->pinned.lock);
> +
> +	return 0;
> +}
> +
>  /**
>   * xe_bo_restore_kernel - restore kernel BOs to VRAM
>   *
> @@ -122,48 +190,32 @@ int xe_bo_evict_all(struct xe_device *xe)
>   */
>  int xe_bo_restore_kernel(struct xe_device *xe)
>  {
> +	struct list_head still_in_list;
>  	struct xe_bo *bo;
>  	int ret;
>  
>  	if (!IS_DGFX(xe))
>  		return 0;
>  
> +	INIT_LIST_HEAD(&still_in_list);
>  	spin_lock(&xe->pinned.lock);
>  	for (;;) {
>  		bo = list_first_entry_or_null(&xe->pinned.evicted,
>  					      typeof(*bo), pinned_link);
>  		if (!bo)
>  			break;
> -		xe_bo_get(bo);
> -		list_move_tail(&bo->pinned_link, &xe->pinned.kernel_bo_present);
> -		spin_unlock(&xe->pinned.lock);
>  
> -		xe_bo_lock(bo, false);
> -		ret = xe_bo_restore_pinned(bo);
> -		xe_bo_unlock(bo);
> -		if (ret) {
> -			xe_bo_put(bo);
> -			return ret;
> +		if (!(bo->flags & (XE_BO_FLAG_PINNED_NEED_LOAD |
> +				   XE_BO_FLAG_PINNED_WONTNEED))) {
> +			list_move_tail(&bo->pinned_link, &still_in_list);
> +			continue;
>  		}
>  
> -		if (bo->flags & XE_BO_FLAG_GGTT) {
> -			struct xe_tile *tile = bo->tile;
> -
> -			mutex_lock(&tile->mem.ggtt->lock);
> -			xe_ggtt_map_bo(tile->mem.ggtt, bo);
> -			mutex_unlock(&tile->mem.ggtt->lock);
> -		}
> -
> -		/*
> -		 * We expect validate to trigger a move VRAM and our move code
> -		 * should setup the iosys map.
> -		 */
> -		xe_assert(xe, !iosys_map_is_null(&bo->vmap));
> -
> -		xe_bo_put(bo);
> -
> -		spin_lock(&xe->pinned.lock);
> +		ret = do_restore_kernel(xe, bo);
> +		if (ret)
> +			return ret;
>  	}
> +	list_splice_tail(&still_in_list, &xe->pinned.evicted);
>  	spin_unlock(&xe->pinned.lock);
>  
>  	return 0;
> @@ -190,9 +242,20 @@ int xe_bo_restore_user(struct xe_device *xe)
>  	if (!IS_DGFX(xe))
>  		return 0;
>  
> -	/* Pinned user memory in VRAM should be validated on resume */
>  	INIT_LIST_HEAD(&still_in_list);
>  	spin_lock(&xe->pinned.lock);
> +	/* Pinned kernel memory with GPU copy */
> +	for (;;) {
> +		bo = list_first_entry_or_null(&xe->pinned.evicted,
> +					      typeof(*bo), pinned_link);
> +		if (!bo)
> +			break;
> +
> +		ret = do_restore_kernel(xe, bo);
> +		if (ret)
> +			return ret;
> +	}
> +	/* Pinned user memory in VRAM should be validated on resume */
>  	for (;;) {
>  		bo = list_first_entry_or_null(&xe->pinned.external_vram,
>  					      typeof(*bo), pinned_link);
> diff --git a/drivers/gpu/drm/xe/xe_ggtt.c b/drivers/gpu/drm/xe/xe_ggtt.c
> index 0124ad120c04..d42dbda983c1 100644
> --- a/drivers/gpu/drm/xe/xe_ggtt.c
> +++ b/drivers/gpu/drm/xe/xe_ggtt.c
> @@ -357,7 +357,7 @@ void xe_ggtt_node_remove(struct xe_ggtt_node *node, bool invalidate)
>  int xe_ggtt_init(struct xe_ggtt *ggtt)
>  {
>  	struct xe_device *xe = tile_to_xe(ggtt->tile);
> -	unsigned int flags;
> +	unsigned int flags = XE_BO_FLAG_PINNED_NEED_LOAD;
>  	int err;
>  
>  	/*
> diff --git a/drivers/gpu/drm/xe/xe_gsc.c b/drivers/gpu/drm/xe/xe_gsc.c
> index 1eb791ddc375..bbb68575a207 100644
> --- a/drivers/gpu/drm/xe/xe_gsc.c
> +++ b/drivers/gpu/drm/xe/xe_gsc.c
> @@ -474,6 +474,7 @@ int xe_gsc_init_post_hwconfig(struct xe_gsc *gsc)
>  		return -ENODEV;
>  
>  	bo = xe_managed_bo_create_pin_map(xe, tile, SZ_4M,
> +					  XE_BO_FLAG_PINNED_NEED_LOAD |
>  					  XE_BO_FLAG_STOLEN |
>  					  XE_BO_FLAG_GGTT);
>  	if (IS_ERR(bo))
> diff --git a/drivers/gpu/drm/xe/xe_gsc_proxy.c b/drivers/gpu/drm/xe/xe_gsc_proxy.c
> index fc64b45d324b..df271f9b657d 100644
> --- a/drivers/gpu/drm/xe/xe_gsc_proxy.c
> +++ b/drivers/gpu/drm/xe/xe_gsc_proxy.c
> @@ -384,6 +384,7 @@ static int proxy_channel_alloc(struct xe_gsc *gsc)
>  		return -ENOMEM;
>  
>  	bo = xe_managed_bo_create_pin_map(xe, tile, GSC_PROXY_CHANNEL_SIZE,
> +					  XE_BO_FLAG_PINNED_NEED_LOAD |
>  					  XE_BO_FLAG_SYSTEM |
>  					  XE_BO_FLAG_GGTT);
>  	if (IS_ERR(bo))
> diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c b/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c
> index 062a0c2fd2cd..75b225cbddf8 100644
> --- a/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c
> +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c
> @@ -1405,6 +1405,7 @@ static int pf_provision_vf_lmem(struct xe_gt *gt, unsigned int vfid, u64 size)
>  	bo = xe_bo_create_pin_map(xe, tile, NULL,
>  				  ALIGN(size, PAGE_SIZE),
>  				  ttm_bo_type_kernel,
> +				  XE_BO_FLAG_PINNED_NEED_LOAD |
>  				  XE_BO_FLAG_VRAM_IF_DGFX(tile) |
>  				  XE_BO_FLAG_NEEDS_2M |
>  				  XE_BO_FLAG_PINNED);
> diff --git a/drivers/gpu/drm/xe/xe_huc.c b/drivers/gpu/drm/xe/xe_huc.c
> index 6a846e4cb221..1008626b517e 100644
> --- a/drivers/gpu/drm/xe/xe_huc.c
> +++ b/drivers/gpu/drm/xe/xe_huc.c
> @@ -53,6 +53,7 @@ static int huc_alloc_gsc_pkt(struct xe_huc *huc)
>  	/* we use a single object for both input and output */
>  	bo = xe_managed_bo_create_pin_map(xe, gt_to_tile(gt),
>  					  PXP43_HUC_AUTH_INOUT_SIZE * 2,
> +					  XE_BO_FLAG_PINNED_NEED_LOAD |
>  					  XE_BO_FLAG_SYSTEM |
>  					  XE_BO_FLAG_GGTT);
>  	if (IS_ERR(bo))
> diff --git a/drivers/gpu/drm/xe/xe_hw_engine.c b/drivers/gpu/drm/xe/xe_hw_engine.c
> index 1557acee3523..46e4342cc9cd 100644
> --- a/drivers/gpu/drm/xe/xe_hw_engine.c
> +++ b/drivers/gpu/drm/xe/xe_hw_engine.c
> @@ -577,6 +577,7 @@ static int hw_engine_init(struct xe_gt *gt, struct xe_hw_engine *hwe,
>  	xe_reg_sr_apply_whitelist(hwe);
>  
>  	hwe->hwsp = xe_managed_bo_create_pin_map(xe, tile, SZ_4K,
> +						 XE_BO_FLAG_PINNED_NEED_LOAD |
>  						 XE_BO_FLAG_VRAM_IF_DGFX(tile) |
>  						 XE_BO_FLAG_GGTT |
>  						 XE_BO_FLAG_GGTT_INVALIDATE);
> diff --git a/drivers/gpu/drm/xe/xe_lmtt.c b/drivers/gpu/drm/xe/xe_lmtt.c
> index a60ceae4c6dd..c3e76ddf24be 100644
> --- a/drivers/gpu/drm/xe/xe_lmtt.c
> +++ b/drivers/gpu/drm/xe/xe_lmtt.c
> @@ -70,6 +70,7 @@ static struct xe_lmtt_pt *lmtt_pt_alloc(struct xe_lmtt *lmtt, unsigned int level
>  				  PAGE_ALIGN(lmtt->ops->lmtt_pte_size(level) *
>  					     lmtt->ops->lmtt_pte_num(level)),
>  				  ttm_bo_type_kernel,
> +				  XE_BO_FLAG_PINNED_NEED_LOAD |
>  				  XE_BO_FLAG_VRAM_IF_DGFX(lmtt_to_tile(lmtt)) |
>  				  XE_BO_FLAG_NEEDS_64K | XE_BO_FLAG_PINNED);
>  	if (IS_ERR(bo)) {
> diff --git a/drivers/gpu/drm/xe/xe_lrc.c b/drivers/gpu/drm/xe/xe_lrc.c
> index 4f64c7f4e68d..fec5aa700ce1 100644
> --- a/drivers/gpu/drm/xe/xe_lrc.c
> +++ b/drivers/gpu/drm/xe/xe_lrc.c
> @@ -884,6 +884,8 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
>  	void *init_data = NULL;
>  	u32 arb_enable;
>  	u32 lrc_size;
> +	unsigned long restore_flags = vm && vm->flags & XE_VM_FLAG_MIGRATION ?
> +		XE_BO_FLAG_PINNED_NEED_LOAD : 0;
>  	int err;
>  
>  	kref_init(&lrc->refcount);
> @@ -898,6 +900,7 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
>  	 */
>  	lrc->bo = xe_bo_create_pin_map(xe, tile, vm, lrc_size,
>  				       ttm_bo_type_kernel,
> +				       restore_flags |
>  				       XE_BO_FLAG_VRAM_IF_DGFX(tile) |
>  				       XE_BO_FLAG_GGTT |
>  				       XE_BO_FLAG_GGTT_INVALIDATE);
> diff --git a/drivers/gpu/drm/xe/xe_memirq.c b/drivers/gpu/drm/xe/xe_memirq.c
> index f833da88150a..6e63a7c4206e 100644
> --- a/drivers/gpu/drm/xe/xe_memirq.c
> +++ b/drivers/gpu/drm/xe/xe_memirq.c
> @@ -187,6 +187,7 @@ static int memirq_alloc_pages(struct xe_memirq *memirq)
>  	/* XXX: convert to managed bo */
>  	bo = xe_bo_create_pin_map(xe, tile, NULL, bo_size,
>  				  ttm_bo_type_kernel,
> +				  XE_BO_FLAG_PINNED_NEED_LOAD |
>  				  XE_BO_FLAG_SYSTEM |
>  				  XE_BO_FLAG_GGTT |
>  				  XE_BO_FLAG_GGTT_INVALIDATE |
> diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c
> index cfd31ae49cc1..cf8ad0c47dfb 100644
> --- a/drivers/gpu/drm/xe/xe_migrate.c
> +++ b/drivers/gpu/drm/xe/xe_migrate.c
> @@ -208,6 +208,7 @@ static int xe_migrate_prepare_vm(struct xe_tile *tile, struct xe_migrate *m,
>  	bo = xe_bo_create_pin_map(vm->xe, tile, vm,
>  				  num_entries * XE_PAGE_SIZE,
>  				  ttm_bo_type_kernel,
> +				  XE_BO_FLAG_PINNED_NEED_LOAD |
>  				  XE_BO_FLAG_VRAM_IF_DGFX(tile) |
>  				  XE_BO_FLAG_PINNED);
>  	if (IS_ERR(bo))
> diff --git a/drivers/gpu/drm/xe/xe_uc_fw.c b/drivers/gpu/drm/xe/xe_uc_fw.c
> index b0deb082ab2b..984b0e0a6241 100644
> --- a/drivers/gpu/drm/xe/xe_uc_fw.c
> +++ b/drivers/gpu/drm/xe/xe_uc_fw.c
> @@ -793,6 +793,7 @@ int xe_uc_fw_init(struct xe_uc_fw *uc_fw)
>  		return 0;
>  
>  	err = uc_fw_copy(uc_fw, fw->data, fw->size,
> +			 XE_BO_FLAG_PINNED_NEED_LOAD |
>  			 XE_BO_FLAG_SYSTEM | XE_BO_FLAG_GGTT |
>  			 XE_BO_FLAG_GGTT_INVALIDATE);
>  
> -- 
> 2.34.1
> 


More information about the Intel-xe mailing list