[PATCH v3 2/5] drm/amdgpu: Throttle visible VRAM moves separately

Fri Jul 7 22:23:27 UTC 2017

On Thu, Jul 06, 2017 at 07:51:27PM +0900, Michel Dänzer wrote:
> From: John Brooks <john at fastquake.com>
> 
> The BO move throttling code is designed to allow VRAM to fill quickly if it
> is relatively empty. However, this does not take into account situations
> where the visible VRAM is smaller than total VRAM, and total VRAM may not
> be close to full but the visible VRAM segment is under pressure. In such
> situations, visible VRAM would experience unrestricted swapping and
> performance would drop.
> 
> Add a separate counter specifically for moves involving visible VRAM, and
> check it before moving BOs there.
> 
> v2: Only perform calculations for separate counter if visible VRAM is
>     smaller than total VRAM. (Michel Dänzer)
> v3: [Michel Dänzer]
> * Use BO's location rather than the AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED
>   flag to determine whether to account a move for visible VRAM in most
>   cases.
> * Use a single
> 
> 	if (adev->mc.visible_vram_size < adev->mc.real_vram_size) {
> 
>   block in amdgpu_cs_get_threshold_for_moves.
> 
> Fixes: 95844d20ae02 (drm/amdgpu: throttle buffer migrations at CS using a fixed MBps limit (v2))
> Signed-off-by: John Brooks <john at fastquake.com>
> Signed-off-by: Michel Dänzer <michel.daenzer at amd.com>

Changes look good. You can have my

Reviewed-by: John Brooks <john at fastquake.com>

if you want it.

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  6 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c     | 92 ++++++++++++++++++++++++------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 12 +++-
>  3 files changed, 87 insertions(+), 23 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index b95c1074d42c..463d6c241157 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1162,7 +1162,9 @@ struct amdgpu_cs_parser {
>  	struct list_head		validated;
>  	struct dma_fence		*fence;
>  	uint64_t			bytes_moved_threshold;
> +	uint64_t			bytes_moved_vis_threshold;
>  	uint64_t			bytes_moved;
> +	uint64_t			bytes_moved_vis;
>  	struct amdgpu_bo_list_entry	*evictable;
>  
>  	/* user fence */
> @@ -1596,6 +1598,7 @@ struct amdgpu_device {
>  		spinlock_t		lock;
>  		s64			last_update_us;
>  		s64			accum_us; /* accumulated microseconds */
> +		s64			accum_us_vis; /* for visible VRAM */
>  		u32			log2_max_MBps;
>  	} mm_stats;
>  
> @@ -1892,7 +1895,8 @@ void amdgpu_pci_config_reset(struct amdgpu_device *adev);
>  bool amdgpu_need_post(struct amdgpu_device *adev);
>  void amdgpu_update_display_priority(struct amdgpu_device *adev);
>  
> -void amdgpu_cs_report_moved_bytes(struct amdgpu_device *adev, u64 num_bytes);
> +void amdgpu_cs_report_moved_bytes(struct amdgpu_device *adev, u64 num_bytes,
> +				  u64 num_vis_bytes);
>  void amdgpu_ttm_placement_from_domain(struct amdgpu_bo *abo, u32 domain);
>  bool amdgpu_ttm_bo_is_amdgpu_bo(struct ttm_buffer_object *bo);
>  int amdgpu_ttm_tt_get_user_pages(struct ttm_tt *ttm, struct page **pages);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index 82131d70a06b..44ec11d4d733 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -220,10 +220,11 @@ static s64 bytes_to_us(struct amdgpu_device *adev, u64 bytes)
>   * ticks. The accumulated microseconds (us) are converted to bytes and
>   * returned.
>   */
> -static u64 amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev)
> +static void amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev,
> +					      u64 *max_bytes,
> +					      u64 *max_vis_bytes)
>  {
>  	s64 time_us, increment_us;
> -	u64 max_bytes;
>  	u64 free_vram, total_vram, used_vram;
>  
>  	/* Allow a maximum of 200 accumulated ms. This is basically per-IB
> @@ -235,8 +236,11 @@ static u64 amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev)
>  	 */
>  	const s64 us_upper_bound = 200000;
>  
> -	if (!adev->mm_stats.log2_max_MBps)
> -		return 0;
> +	if (!adev->mm_stats.log2_max_MBps) {
> +		*max_bytes = 0;
> +		*max_vis_bytes = 0;
> +		return;
> +	}
>  
>  	total_vram = adev->mc.real_vram_size - adev->vram_pin_size;
>  	used_vram = atomic64_read(&adev->vram_usage);
> @@ -277,23 +281,45 @@ static u64 amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev)
>  		adev->mm_stats.accum_us = max(min_us, adev->mm_stats.accum_us);
>  	}
>  
> -	/* This returns 0 if the driver is in debt to disallow (optional)
> +	/* This is set to 0 if the driver is in debt to disallow (optional)
>  	 * buffer moves.
>  	 */
> -	max_bytes = us_to_bytes(adev, adev->mm_stats.accum_us);
> +	*max_bytes = us_to_bytes(adev, adev->mm_stats.accum_us);
> +
> +	/* Do the same for visible VRAM if half of it is free */
> +	if (adev->mc.visible_vram_size < adev->mc.real_vram_size) {
> +		u64 total_vis_vram = adev->mc.visible_vram_size;
> +		u64 used_vis_vram = atomic64_read(&adev->vram_vis_usage);
> +
> +		if (used_vis_vram < total_vis_vram) {
> +			u64 free_vis_vram = total_vis_vram - used_vis_vram;
> +			adev->mm_stats.accum_us_vis = min(adev->mm_stats.accum_us_vis +
> +							  increment_us, us_upper_bound);
> +
> +			if (free_vis_vram >= total_vis_vram / 2)
> +				adev->mm_stats.accum_us_vis =
> +					max(bytes_to_us(adev, free_vis_vram / 2),
> +					    adev->mm_stats.accum_us_vis);
> +		}
> +
> +		*max_vis_bytes = us_to_bytes(adev, adev->mm_stats.accum_us_vis);
> +	} else {
> +		*max_vis_bytes = 0;
> +	}
>  
>  	spin_unlock(&adev->mm_stats.lock);
> -	return max_bytes;
>  }
>  
>  /* Report how many bytes have really been moved for the last command
>   * submission. This can result in a debt that can stop buffer migrations
>   * temporarily.
>   */
> -void amdgpu_cs_report_moved_bytes(struct amdgpu_device *adev, u64 num_bytes)
> +void amdgpu_cs_report_moved_bytes(struct amdgpu_device *adev, u64 num_bytes,
> +				  u64 num_vis_bytes)
>  {
>  	spin_lock(&adev->mm_stats.lock);
>  	adev->mm_stats.accum_us -= bytes_to_us(adev, num_bytes);
> +	adev->mm_stats.accum_us_vis -= bytes_to_us(adev, num_vis_bytes);
>  	spin_unlock(&adev->mm_stats.lock);
>  }
>  
> @@ -301,7 +327,7 @@ static int amdgpu_cs_bo_validate(struct amdgpu_cs_parser *p,
>  				 struct amdgpu_bo *bo)
>  {
>  	struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
> -	u64 initial_bytes_moved;
> +	u64 initial_bytes_moved, bytes_moved;
>  	uint32_t domain;
>  	int r;
>  
> @@ -311,17 +337,35 @@ static int amdgpu_cs_bo_validate(struct amdgpu_cs_parser *p,
>  	/* Don't move this buffer if we have depleted our allowance
>  	 * to move it. Don't move anything if the threshold is zero.
>  	 */
> -	if (p->bytes_moved < p->bytes_moved_threshold)
> -		domain = bo->prefered_domains;
> -	else
> +	if (p->bytes_moved < p->bytes_moved_threshold) {
> +		if (adev->mc.visible_vram_size < adev->mc.real_vram_size &&
> +		    (bo->flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED)) {
> +			/* And don't move a CPU_ACCESS_REQUIRED BO to limited
> +			 * visible VRAM if we've depleted our allowance to do
> +			 * that.
> +			 */
> +			if (p->bytes_moved_vis < p->bytes_moved_vis_threshold)
> +				domain = bo->prefered_domains;
> +			else
> +				domain = bo->allowed_domains;
> +		} else {
> +			domain = bo->prefered_domains;
> +		}
> +	} else {
>  		domain = bo->allowed_domains;
> +	}
>  
>  retry:
>  	amdgpu_ttm_placement_from_domain(bo, domain);
>  	initial_bytes_moved = atomic64_read(&adev->num_bytes_moved);
>  	r = ttm_bo_validate(&bo->tbo, &bo->placement, true, false);
> -	p->bytes_moved += atomic64_read(&adev->num_bytes_moved) -
> -		initial_bytes_moved;
> +	bytes_moved = atomic64_read(&adev->num_bytes_moved) -
> +		      initial_bytes_moved;
> +	p->bytes_moved += bytes_moved;
> +	if (adev->mc.visible_vram_size < adev->mc.real_vram_size &&
> +	    bo->tbo.mem.mem_type == TTM_PL_VRAM &&
> +	    bo->tbo.mem.start < adev->mc.visible_vram_size >> PAGE_SHIFT)
> +		p->bytes_moved_vis += bytes_moved;
>  
>  	if (unlikely(r == -ENOMEM) && domain != bo->allowed_domains) {
>  		domain = bo->allowed_domains;
> @@ -347,7 +391,8 @@ static bool amdgpu_cs_try_evict(struct amdgpu_cs_parser *p,
>  		struct amdgpu_bo_list_entry *candidate = p->evictable;
>  		struct amdgpu_bo *bo = candidate->robj;
>  		struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
> -		u64 initial_bytes_moved;
> +		u64 initial_bytes_moved, bytes_moved;
> +		bool update_bytes_moved_vis;
>  		uint32_t other;
>  
>  		/* If we reached our current BO we can forget it */
> @@ -367,10 +412,17 @@ static bool amdgpu_cs_try_evict(struct amdgpu_cs_parser *p,
>  
>  		/* Good we can try to move this BO somewhere else */
>  		amdgpu_ttm_placement_from_domain(bo, other);
> +		update_bytes_moved_vis =
> +			adev->mc.visible_vram_size < adev->mc.real_vram_size &&
> +			bo->tbo.mem.mem_type == TTM_PL_VRAM &&
> +			bo->tbo.mem.start < adev->mc.visible_vram_size >> PAGE_SHIFT;
>  		initial_bytes_moved = atomic64_read(&adev->num_bytes_moved);
>  		r = ttm_bo_validate(&bo->tbo, &bo->placement, true, false);
> -		p->bytes_moved += atomic64_read(&adev->num_bytes_moved) -
> +		bytes_moved = atomic64_read(&adev->num_bytes_moved) -
>  			initial_bytes_moved;
> +		p->bytes_moved += bytes_moved;
> +		if (update_bytes_moved_vis)
> +			p->bytes_moved_vis += bytes_moved;
>  
>  		if (unlikely(r))
>  			break;
> @@ -550,8 +602,10 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
>  		list_splice(&need_pages, &p->validated);
>  	}
>  
> -	p->bytes_moved_threshold = amdgpu_cs_get_threshold_for_moves(p->adev);
> +	amdgpu_cs_get_threshold_for_moves(p->adev, &p->bytes_moved_threshold,
> +					  &p->bytes_moved_vis_threshold);
>  	p->bytes_moved = 0;
> +	p->bytes_moved_vis = 0;
>  	p->evictable = list_last_entry(&p->validated,
>  				       struct amdgpu_bo_list_entry,
>  				       tv.head);
> @@ -575,8 +629,8 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
>  		goto error_validate;
>  	}
>  
> -	amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved);
> -
> +	amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved,
> +				     p->bytes_moved_vis);
>  	fpriv->vm.last_eviction_counter =
>  		atomic64_read(&p->adev->num_evictions);
>  
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> index a85e75327456..e429829ae93d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> @@ -322,7 +322,7 @@ int amdgpu_bo_create_restricted(struct amdgpu_device *adev,
>  	struct amdgpu_bo *bo;
>  	enum ttm_bo_type type;
>  	unsigned long page_align;
> -	u64 initial_bytes_moved;
> +	u64 initial_bytes_moved, bytes_moved;
>  	size_t acc_size;
>  	int r;
>  
> @@ -398,8 +398,14 @@ int amdgpu_bo_create_restricted(struct amdgpu_device *adev,
>  	r = ttm_bo_init_reserved(&adev->mman.bdev, &bo->tbo, size, type,
>  				 &bo->placement, page_align, !kernel, NULL,
>  				 acc_size, sg, resv, &amdgpu_ttm_bo_destroy);
> -	amdgpu_cs_report_moved_bytes(adev,
> -		atomic64_read(&adev->num_bytes_moved) - initial_bytes_moved);
> +	bytes_moved = atomic64_read(&adev->num_bytes_moved) -
> +		      initial_bytes_moved;
> +	if (adev->mc.visible_vram_size < adev->mc.real_vram_size &&
> +	    bo->tbo.mem.mem_type == TTM_PL_VRAM &&
> +	    bo->tbo.mem.start < adev->mc.visible_vram_size >> PAGE_SHIFT)
> +		amdgpu_cs_report_moved_bytes(adev, bytes_moved, bytes_moved);
> +	else
> +		amdgpu_cs_report_moved_bytes(adev, bytes_moved, 0);
>  
>  	if (unlikely(r != 0))
>  		return r;
> -- 
> 2.13.2
>