[PATCH v3 2/5] drm/amdgpu: Throttle visible VRAM moves separately

Thu Jul 6 11:38:15 UTC 2017

Am 06.07.2017 um 12:51 schrieb Michel Dänzer:
> From: John Brooks <john at fastquake.com>
>
> The BO move throttling code is designed to allow VRAM to fill quickly if it
> is relatively empty. However, this does not take into account situations
> where the visible VRAM is smaller than total VRAM, and total VRAM may not
> be close to full but the visible VRAM segment is under pressure. In such
> situations, visible VRAM would experience unrestricted swapping and
> performance would drop.
>
> Add a separate counter specifically for moves involving visible VRAM, and
> check it before moving BOs there.
>
> v2: Only perform calculations for separate counter if visible VRAM is
>      smaller than total VRAM. (Michel Dänzer)
> v3: [Michel Dänzer]
> * Use BO's location rather than the AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED
>    flag to determine whether to account a move for visible VRAM in most
>    cases.
> * Use a single
>
> 	if (adev->mc.visible_vram_size < adev->mc.real_vram_size) {
>
>    block in amdgpu_cs_get_threshold_for_moves.
>
> Fixes: 95844d20ae02 (drm/amdgpu: throttle buffer migrations at CS using a fixed MBps limit (v2))
> Signed-off-by: John Brooks <john at fastquake.com>
> Signed-off-by: Michel Dänzer <michel.daenzer at amd.com>

Reviewed-by: Christian König <christian.koenig at amd.com>

> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  6 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c     | 92 ++++++++++++++++++++++++------
>   drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 12 +++-
>   3 files changed, 87 insertions(+), 23 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index b95c1074d42c..463d6c241157 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1162,7 +1162,9 @@ struct amdgpu_cs_parser {
>   	struct list_head		validated;
>   	struct dma_fence		*fence;
>   	uint64_t			bytes_moved_threshold;
> +	uint64_t			bytes_moved_vis_threshold;
>   	uint64_t			bytes_moved;
> +	uint64_t			bytes_moved_vis;
>   	struct amdgpu_bo_list_entry	*evictable;
>   
>   	/* user fence */
> @@ -1596,6 +1598,7 @@ struct amdgpu_device {
>   		spinlock_t		lock;
>   		s64			last_update_us;
>   		s64			accum_us; /* accumulated microseconds */
> +		s64			accum_us_vis; /* for visible VRAM */
>   		u32			log2_max_MBps;
>   	} mm_stats;
>   
> @@ -1892,7 +1895,8 @@ void amdgpu_pci_config_reset(struct amdgpu_device *adev);
>   bool amdgpu_need_post(struct amdgpu_device *adev);
>   void amdgpu_update_display_priority(struct amdgpu_device *adev);
>   
> -void amdgpu_cs_report_moved_bytes(struct amdgpu_device *adev, u64 num_bytes);
> +void amdgpu_cs_report_moved_bytes(struct amdgpu_device *adev, u64 num_bytes,
> +				  u64 num_vis_bytes);
>   void amdgpu_ttm_placement_from_domain(struct amdgpu_bo *abo, u32 domain);
>   bool amdgpu_ttm_bo_is_amdgpu_bo(struct ttm_buffer_object *bo);
>   int amdgpu_ttm_tt_get_user_pages(struct ttm_tt *ttm, struct page **pages);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index 82131d70a06b..44ec11d4d733 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -220,10 +220,11 @@ static s64 bytes_to_us(struct amdgpu_device *adev, u64 bytes)
>    * ticks. The accumulated microseconds (us) are converted to bytes and
>    * returned.
>    */
> -static u64 amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev)
> +static void amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev,
> +					      u64 *max_bytes,
> +					      u64 *max_vis_bytes)
>   {
>   	s64 time_us, increment_us;
> -	u64 max_bytes;
>   	u64 free_vram, total_vram, used_vram;
>   
>   	/* Allow a maximum of 200 accumulated ms. This is basically per-IB
> @@ -235,8 +236,11 @@ static u64 amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev)
>   	 */
>   	const s64 us_upper_bound = 200000;
>   
> -	if (!adev->mm_stats.log2_max_MBps)
> -		return 0;
> +	if (!adev->mm_stats.log2_max_MBps) {
> +		*max_bytes = 0;
> +		*max_vis_bytes = 0;
> +		return;
> +	}
>   
>   	total_vram = adev->mc.real_vram_size - adev->vram_pin_size;
>   	used_vram = atomic64_read(&adev->vram_usage);
> @@ -277,23 +281,45 @@ static u64 amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev)
>   		adev->mm_stats.accum_us = max(min_us, adev->mm_stats.accum_us);
>   	}
>   
> -	/* This returns 0 if the driver is in debt to disallow (optional)
> +	/* This is set to 0 if the driver is in debt to disallow (optional)
>   	 * buffer moves.
>   	 */
> -	max_bytes = us_to_bytes(adev, adev->mm_stats.accum_us);
> +	*max_bytes = us_to_bytes(adev, adev->mm_stats.accum_us);
> +
> +	/* Do the same for visible VRAM if half of it is free */
> +	if (adev->mc.visible_vram_size < adev->mc.real_vram_size) {
> +		u64 total_vis_vram = adev->mc.visible_vram_size;
> +		u64 used_vis_vram = atomic64_read(&adev->vram_vis_usage);
> +
> +		if (used_vis_vram < total_vis_vram) {
> +			u64 free_vis_vram = total_vis_vram - used_vis_vram;
> +			adev->mm_stats.accum_us_vis = min(adev->mm_stats.accum_us_vis +
> +							  increment_us, us_upper_bound);
> +
> +			if (free_vis_vram >= total_vis_vram / 2)
> +				adev->mm_stats.accum_us_vis =
> +					max(bytes_to_us(adev, free_vis_vram / 2),
> +					    adev->mm_stats.accum_us_vis);
> +		}
> +
> +		*max_vis_bytes = us_to_bytes(adev, adev->mm_stats.accum_us_vis);
> +	} else {
> +		*max_vis_bytes = 0;
> +	}
>   
>   	spin_unlock(&adev->mm_stats.lock);
> -	return max_bytes;
>   }
>   
>   /* Report how many bytes have really been moved for the last command
>    * submission. This can result in a debt that can stop buffer migrations
>    * temporarily.
>    */
> -void amdgpu_cs_report_moved_bytes(struct amdgpu_device *adev, u64 num_bytes)
> +void amdgpu_cs_report_moved_bytes(struct amdgpu_device *adev, u64 num_bytes,
> +				  u64 num_vis_bytes)
>   {
>   	spin_lock(&adev->mm_stats.lock);
>   	adev->mm_stats.accum_us -= bytes_to_us(adev, num_bytes);
> +	adev->mm_stats.accum_us_vis -= bytes_to_us(adev, num_vis_bytes);
>   	spin_unlock(&adev->mm_stats.lock);
>   }
>   
> @@ -301,7 +327,7 @@ static int amdgpu_cs_bo_validate(struct amdgpu_cs_parser *p,
>   				 struct amdgpu_bo *bo)
>   {
>   	struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
> -	u64 initial_bytes_moved;
> +	u64 initial_bytes_moved, bytes_moved;
>   	uint32_t domain;
>   	int r;
>   
> @@ -311,17 +337,35 @@ static int amdgpu_cs_bo_validate(struct amdgpu_cs_parser *p,
>   	/* Don't move this buffer if we have depleted our allowance
>   	 * to move it. Don't move anything if the threshold is zero.
>   	 */
> -	if (p->bytes_moved < p->bytes_moved_threshold)
> -		domain = bo->prefered_domains;
> -	else
> +	if (p->bytes_moved < p->bytes_moved_threshold) {
> +		if (adev->mc.visible_vram_size < adev->mc.real_vram_size &&
> +		    (bo->flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED)) {
> +			/* And don't move a CPU_ACCESS_REQUIRED BO to limited
> +			 * visible VRAM if we've depleted our allowance to do
> +			 * that.
> +			 */
> +			if (p->bytes_moved_vis < p->bytes_moved_vis_threshold)
> +				domain = bo->prefered_domains;
> +			else
> +				domain = bo->allowed_domains;
> +		} else {
> +			domain = bo->prefered_domains;
> +		}
> +	} else {
>   		domain = bo->allowed_domains;
> +	}
>   
>   retry:
>   	amdgpu_ttm_placement_from_domain(bo, domain);
>   	initial_bytes_moved = atomic64_read(&adev->num_bytes_moved);
>   	r = ttm_bo_validate(&bo->tbo, &bo->placement, true, false);
> -	p->bytes_moved += atomic64_read(&adev->num_bytes_moved) -
> -		initial_bytes_moved;
> +	bytes_moved = atomic64_read(&adev->num_bytes_moved) -
> +		      initial_bytes_moved;
> +	p->bytes_moved += bytes_moved;
> +	if (adev->mc.visible_vram_size < adev->mc.real_vram_size &&
> +	    bo->tbo.mem.mem_type == TTM_PL_VRAM &&
> +	    bo->tbo.mem.start < adev->mc.visible_vram_size >> PAGE_SHIFT)
> +		p->bytes_moved_vis += bytes_moved;
>   
>   	if (unlikely(r == -ENOMEM) && domain != bo->allowed_domains) {
>   		domain = bo->allowed_domains;
> @@ -347,7 +391,8 @@ static bool amdgpu_cs_try_evict(struct amdgpu_cs_parser *p,
>   		struct amdgpu_bo_list_entry *candidate = p->evictable;
>   		struct amdgpu_bo *bo = candidate->robj;
>   		struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
> -		u64 initial_bytes_moved;
> +		u64 initial_bytes_moved, bytes_moved;
> +		bool update_bytes_moved_vis;
>   		uint32_t other;
>   
>   		/* If we reached our current BO we can forget it */
> @@ -367,10 +412,17 @@ static bool amdgpu_cs_try_evict(struct amdgpu_cs_parser *p,
>   
>   		/* Good we can try to move this BO somewhere else */
>   		amdgpu_ttm_placement_from_domain(bo, other);
> +		update_bytes_moved_vis =
> +			adev->mc.visible_vram_size < adev->mc.real_vram_size &&
> +			bo->tbo.mem.mem_type == TTM_PL_VRAM &&
> +			bo->tbo.mem.start < adev->mc.visible_vram_size >> PAGE_SHIFT;
>   		initial_bytes_moved = atomic64_read(&adev->num_bytes_moved);
>   		r = ttm_bo_validate(&bo->tbo, &bo->placement, true, false);
> -		p->bytes_moved += atomic64_read(&adev->num_bytes_moved) -
> +		bytes_moved = atomic64_read(&adev->num_bytes_moved) -
>   			initial_bytes_moved;
> +		p->bytes_moved += bytes_moved;
> +		if (update_bytes_moved_vis)
> +			p->bytes_moved_vis += bytes_moved;
>   
>   		if (unlikely(r))
>   			break;
> @@ -550,8 +602,10 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
>   		list_splice(&need_pages, &p->validated);
>   	}
>   
> -	p->bytes_moved_threshold = amdgpu_cs_get_threshold_for_moves(p->adev);
> +	amdgpu_cs_get_threshold_for_moves(p->adev, &p->bytes_moved_threshold,
> +					  &p->bytes_moved_vis_threshold);
>   	p->bytes_moved = 0;
> +	p->bytes_moved_vis = 0;
>   	p->evictable = list_last_entry(&p->validated,
>   				       struct amdgpu_bo_list_entry,
>   				       tv.head);
> @@ -575,8 +629,8 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
>   		goto error_validate;
>   	}
>   
> -	amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved);
> -
> +	amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved,
> +				     p->bytes_moved_vis);
>   	fpriv->vm.last_eviction_counter =
>   		atomic64_read(&p->adev->num_evictions);
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> index a85e75327456..e429829ae93d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> @@ -322,7 +322,7 @@ int amdgpu_bo_create_restricted(struct amdgpu_device *adev,
>   	struct amdgpu_bo *bo;
>   	enum ttm_bo_type type;
>   	unsigned long page_align;
> -	u64 initial_bytes_moved;
> +	u64 initial_bytes_moved, bytes_moved;
>   	size_t acc_size;
>   	int r;
>   
> @@ -398,8 +398,14 @@ int amdgpu_bo_create_restricted(struct amdgpu_device *adev,
>   	r = ttm_bo_init_reserved(&adev->mman.bdev, &bo->tbo, size, type,
>   				 &bo->placement, page_align, !kernel, NULL,
>   				 acc_size, sg, resv, &amdgpu_ttm_bo_destroy);
> -	amdgpu_cs_report_moved_bytes(adev,
> -		atomic64_read(&adev->num_bytes_moved) - initial_bytes_moved);
> +	bytes_moved = atomic64_read(&adev->num_bytes_moved) -
> +		      initial_bytes_moved;
> +	if (adev->mc.visible_vram_size < adev->mc.real_vram_size &&
> +	    bo->tbo.mem.mem_type == TTM_PL_VRAM &&
> +	    bo->tbo.mem.start < adev->mc.visible_vram_size >> PAGE_SHIFT)
> +		amdgpu_cs_report_moved_bytes(adev, bytes_moved, bytes_moved);
> +	else
> +		amdgpu_cs_report_moved_bytes(adev, bytes_moved, 0);
>   
>   	if (unlikely(r != 0))
>   		return r;