[PATCH v3 2/5] drm/amdgpu: Throttle visible VRAM moves separately
John Brooks
john at fastquake.com
Fri Jul 7 22:23:27 UTC 2017
On Thu, Jul 06, 2017 at 07:51:27PM +0900, Michel Dänzer wrote:
> From: John Brooks <john at fastquake.com>
>
> The BO move throttling code is designed to allow VRAM to fill quickly if it
> is relatively empty. However, this does not take into account situations
> where the visible VRAM is smaller than total VRAM, and total VRAM may not
> be close to full but the visible VRAM segment is under pressure. In such
> situations, visible VRAM would experience unrestricted swapping and
> performance would drop.
>
> Add a separate counter specifically for moves involving visible VRAM, and
> check it before moving BOs there.
>
> v2: Only perform calculations for the separate counter if visible VRAM is
> smaller than total VRAM. (Michel Dänzer)
> v3: [Michel Dänzer]
> * Use BO's location rather than the AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED
> flag to determine whether to account a move for visible VRAM in most
> cases.
> * Use a single
>
> if (adev->mc.visible_vram_size < adev->mc.real_vram_size) {
>
> block in amdgpu_cs_get_threshold_for_moves.
>
> Fixes: 95844d20ae02 (drm/amdgpu: throttle buffer migrations at CS using a fixed MBps limit (v2))
> Signed-off-by: John Brooks <john at fastquake.com>
> Signed-off-by: Michel Dänzer <michel.daenzer at amd.com>
Changes look good. You can have my
Reviewed-by: John Brooks <john at fastquake.com>
if you want it.
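
While I'm at it, a quick note for anyone reading this in the archive:
the core idea is simply to run a second, independent move budget for
the CPU-visible part of VRAM. Here is a minimal standalone sketch of
mine (plain C, not the driver code; the real logic is in
amdgpu_cs_get_threshold_for_moves() and amdgpu_cs_bo_validate() below):

#include <stdbool.h>
#include <stdint.h>

/*
 * Sketch only: one budget throttles all BO moves, a second budget
 * additionally throttles moves that must land in CPU-visible VRAM.
 */
struct move_budgets {
    uint64_t bytes_moved;       /* spent on all moves this CS */
    uint64_t threshold;         /* allowance for all moves */
    uint64_t bytes_moved_vis;   /* spent on visible-VRAM moves */
    uint64_t vis_threshold;     /* allowance for visible VRAM */
};

/* May this BO still be validated into its preferred placement? */
static bool may_use_preferred_domain(const struct move_budgets *b,
                                     bool needs_visible_vram)
{
    if (b->bytes_moved >= b->threshold)
        return false;           /* global budget exhausted */
    if (needs_visible_vram && b->bytes_moved_vis >= b->vis_threshold)
        return false;           /* visible budget exhausted */
    return true;                /* both budgets have headroom */
}

Before this patch only the first test existed, so a workload could
thrash the visible segment even though total VRAM was nearly empty.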
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 6 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 92 ++++++++++++++++++++++++------
> drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 12 +++-
> 3 files changed, 87 insertions(+), 23 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index b95c1074d42c..463d6c241157 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1162,7 +1162,9 @@ struct amdgpu_cs_parser {
> struct list_head validated;
> struct dma_fence *fence;
> uint64_t bytes_moved_threshold;
> + uint64_t bytes_moved_vis_threshold;
> uint64_t bytes_moved;
> + uint64_t bytes_moved_vis;
> struct amdgpu_bo_list_entry *evictable;
>
> /* user fence */
> @@ -1596,6 +1598,7 @@ struct amdgpu_device {
> spinlock_t lock;
> s64 last_update_us;
> s64 accum_us; /* accumulated microseconds */
> + s64 accum_us_vis; /* for visible VRAM */
> u32 log2_max_MBps;
> } mm_stats;
>
> @@ -1892,7 +1895,8 @@ void amdgpu_pci_config_reset(struct amdgpu_device *adev);
> bool amdgpu_need_post(struct amdgpu_device *adev);
> void amdgpu_update_display_priority(struct amdgpu_device *adev);
>
> -void amdgpu_cs_report_moved_bytes(struct amdgpu_device *adev, u64 num_bytes);
> +void amdgpu_cs_report_moved_bytes(struct amdgpu_device *adev, u64 num_bytes,
> + u64 num_vis_bytes);
> void amdgpu_ttm_placement_from_domain(struct amdgpu_bo *abo, u32 domain);
> bool amdgpu_ttm_bo_is_amdgpu_bo(struct ttm_buffer_object *bo);
> int amdgpu_ttm_tt_get_user_pages(struct ttm_tt *ttm, struct page **pages);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index 82131d70a06b..44ec11d4d733 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -220,10 +220,11 @@ static s64 bytes_to_us(struct amdgpu_device *adev, u64 bytes)
> * ticks. The accumulated microseconds (us) are converted to bytes and
> * returned.
> */
> -static u64 amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev)
> +static void amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev,
> + u64 *max_bytes,
> + u64 *max_vis_bytes)
> {
> s64 time_us, increment_us;
> - u64 max_bytes;
> u64 free_vram, total_vram, used_vram;
>
> /* Allow a maximum of 200 accumulated ms. This is basically per-IB
> @@ -235,8 +236,11 @@ static u64 amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev)
> */
> const s64 us_upper_bound = 200000;
>
> - if (!adev->mm_stats.log2_max_MBps)
> - return 0;
> + if (!adev->mm_stats.log2_max_MBps) {
> + *max_bytes = 0;
> + *max_vis_bytes = 0;
> + return;
> + }
>
> total_vram = adev->mc.real_vram_size - adev->vram_pin_size;
> used_vram = atomic64_read(&adev->vram_usage);
> @@ -277,23 +281,45 @@ static u64 amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev)
> adev->mm_stats.accum_us = max(min_us, adev->mm_stats.accum_us);
> }
>
> - /* This returns 0 if the driver is in debt to disallow (optional)
> + /* This is set to 0 if the driver is in debt to disallow (optional)
> * buffer moves.
> */
> - max_bytes = us_to_bytes(adev, adev->mm_stats.accum_us);
> + *max_bytes = us_to_bytes(adev, adev->mm_stats.accum_us);
> +
> + /* Do the same for visible VRAM if half of it is free */
> + if (adev->mc.visible_vram_size < adev->mc.real_vram_size) {
> + u64 total_vis_vram = adev->mc.visible_vram_size;
> + u64 used_vis_vram = atomic64_read(&adev->vram_vis_usage);
> +
> + if (used_vis_vram < total_vis_vram) {
> + u64 free_vis_vram = total_vis_vram - used_vis_vram;
> + adev->mm_stats.accum_us_vis = min(adev->mm_stats.accum_us_vis +
> + increment_us, us_upper_bound);
> +
> + if (free_vis_vram >= total_vis_vram / 2)
> + adev->mm_stats.accum_us_vis =
> + max(bytes_to_us(adev, free_vis_vram / 2),
> + adev->mm_stats.accum_us_vis);
> + }
> +
> + *max_vis_bytes = us_to_bytes(adev, adev->mm_stats.accum_us_vis);
> + } else {
> + *max_vis_bytes = 0;
> + }
>
> spin_unlock(&adev->mm_stats.lock);
> - return max_bytes;
> }
>
> /* Report how many bytes have really been moved for the last command
> * submission. This can result in a debt that can stop buffer migrations
> * temporarily.
> */
> -void amdgpu_cs_report_moved_bytes(struct amdgpu_device *adev, u64 num_bytes)
> +void amdgpu_cs_report_moved_bytes(struct amdgpu_device *adev, u64 num_bytes,
> + u64 num_vis_bytes)
> {
> spin_lock(&adev->mm_stats.lock);
> adev->mm_stats.accum_us -= bytes_to_us(adev, num_bytes);
> + adev->mm_stats.accum_us_vis -= bytes_to_us(adev, num_vis_bytes);
> spin_unlock(&adev->mm_stats.lock);
> }
>
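One remark on the "half of it is free" heuristic above, mostly for
the archive: while at least half of the visible segment is free, the
visible budget is floored at the time it would take to move half of
the free space, so an empty segment fills quickly; below that mark
the budget falls back to the plain rate-based accumulator. A worked
example with made-up numbers (mine, not from the patch):

#include <stdint.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))

/* Assume a move throughput of 1000 MB/s, i.e. 1000 bytes per us. */
static int64_t demo_bytes_to_us(uint64_t bytes)
{
    return bytes / 1000;
}

int64_t demo_visible_budget(void)
{
    const int64_t us_upper_bound = 200000;  /* 200 ms, as in the patch */
    uint64_t total_vis = 256ULL << 20;      /* 256 MiB visible VRAM */
    uint64_t used_vis  =  56ULL << 20;      /* 56 MiB already used */
    uint64_t free_vis  = total_vis - used_vis;  /* 200 MiB free */
    int64_t accum_us_vis = 0;               /* budget fully drained */
    int64_t increment_us = 10000;           /* 10 ms since last CS */

    accum_us_vis = MIN(accum_us_vis + increment_us, us_upper_bound);
    if (free_vis >= total_vis / 2)          /* 200 MiB >= 128 MiB */
        accum_us_vis = MAX(demo_bytes_to_us(free_vis / 2),
                           accum_us_vis);

    /* The floor wins: ~104857 us (~105 ms) of budget instead of the
     * 10 ms the accumulator alone would have granted. */
    return accum_us_vis;
}
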
> @@ -301,7 +327,7 @@ static int amdgpu_cs_bo_validate(struct amdgpu_cs_parser *p,
> struct amdgpu_bo *bo)
> {
> struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
> - u64 initial_bytes_moved;
> + u64 initial_bytes_moved, bytes_moved;
> uint32_t domain;
> int r;
>
> @@ -311,17 +337,35 @@ static int amdgpu_cs_bo_validate(struct amdgpu_cs_parser *p,
> /* Don't move this buffer if we have depleted our allowance
> * to move it. Don't move anything if the threshold is zero.
> */
> - if (p->bytes_moved < p->bytes_moved_threshold)
> - domain = bo->prefered_domains;
> - else
> + if (p->bytes_moved < p->bytes_moved_threshold) {
> + if (adev->mc.visible_vram_size < adev->mc.real_vram_size &&
> + (bo->flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED)) {
> + /* And don't move a CPU_ACCESS_REQUIRED BO to limited
> + * visible VRAM if we've depleted our allowance to do
> + * that.
> + */
> + if (p->bytes_moved_vis < p->bytes_moved_vis_threshold)
> + domain = bo->prefered_domains;
> + else
> + domain = bo->allowed_domains;
> + } else {
> + domain = bo->prefered_domains;
> + }
> + } else {
> domain = bo->allowed_domains;
> + }
>
> retry:
> amdgpu_ttm_placement_from_domain(bo, domain);
> initial_bytes_moved = atomic64_read(&adev->num_bytes_moved);
> r = ttm_bo_validate(&bo->tbo, &bo->placement, true, false);
> - p->bytes_moved += atomic64_read(&adev->num_bytes_moved) -
> - initial_bytes_moved;
> + bytes_moved = atomic64_read(&adev->num_bytes_moved) -
> + initial_bytes_moved;
> + p->bytes_moved += bytes_moved;
> + if (adev->mc.visible_vram_size < adev->mc.real_vram_size &&
> + bo->tbo.mem.mem_type == TTM_PL_VRAM &&
> + bo->tbo.mem.start < adev->mc.visible_vram_size >> PAGE_SHIFT)
> + p->bytes_moved_vis += bytes_moved;
>
> if (unlikely(r == -ENOMEM) && domain != bo->allowed_domains) {
> domain = bo->allowed_domains;
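
A small note on the visibility test above, since the same pattern
recurs in amdgpu_cs_try_evict() and amdgpu_bo_create_restricted()
below: bo->tbo.mem.start is in units of pages, which is why
visible_vram_size (bytes) is shifted right by PAGE_SHIFT before the
comparison. Standalone illustration (assuming 4 KiB pages; mine, not
driver code):

#include <stdbool.h>
#include <stdint.h>

#define DEMO_PAGE_SHIFT 12  /* assume 4 KiB pages for the example */

/* True if a placement starting at page 'start_page' begins inside
 * the first 'visible_vram_size' bytes of VRAM (the CPU-visible
 * window). As in the patch, only the start offset is tested. */
static bool demo_in_visible_vram(uint64_t start_page,
                                 uint64_t visible_vram_size)
{
    return start_page < (visible_vram_size >> DEMO_PAGE_SHIFT);
}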
> @@ -347,7 +391,8 @@ static bool amdgpu_cs_try_evict(struct amdgpu_cs_parser *p,
> struct amdgpu_bo_list_entry *candidate = p->evictable;
> struct amdgpu_bo *bo = candidate->robj;
> struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
> - u64 initial_bytes_moved;
> + u64 initial_bytes_moved, bytes_moved;
> + bool update_bytes_moved_vis;
> uint32_t other;
>
> /* If we reached our current BO we can forget it */
> @@ -367,10 +412,17 @@ static bool amdgpu_cs_try_evict(struct amdgpu_cs_parser *p,
>
> /* Good we can try to move this BO somewhere else */
> amdgpu_ttm_placement_from_domain(bo, other);
> + update_bytes_moved_vis =
> + adev->mc.visible_vram_size < adev->mc.real_vram_size &&
> + bo->tbo.mem.mem_type == TTM_PL_VRAM &&
> + bo->tbo.mem.start < adev->mc.visible_vram_size >> PAGE_SHIFT;
> initial_bytes_moved = atomic64_read(&adev->num_bytes_moved);
> r = ttm_bo_validate(&bo->tbo, &bo->placement, true, false);
> - p->bytes_moved += atomic64_read(&adev->num_bytes_moved) -
> + bytes_moved = atomic64_read(&adev->num_bytes_moved) -
> initial_bytes_moved;
> + p->bytes_moved += bytes_moved;
> + if (update_bytes_moved_vis)
> + p->bytes_moved_vis += bytes_moved;
>
> if (unlikely(r))
> break;
> @@ -550,8 +602,10 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
> list_splice(&need_pages, &p->validated);
> }
>
> - p->bytes_moved_threshold = amdgpu_cs_get_threshold_for_moves(p->adev);
> + amdgpu_cs_get_threshold_for_moves(p->adev, &p->bytes_moved_threshold,
> + &p->bytes_moved_vis_threshold);
> p->bytes_moved = 0;
> + p->bytes_moved_vis = 0;
> p->evictable = list_last_entry(&p->validated,
> struct amdgpu_bo_list_entry,
> tv.head);
> @@ -575,8 +629,8 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
> goto error_validate;
> }
>
> - amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved);
> -
> + amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved,
> + p->bytes_moved_vis);
> fpriv->vm.last_eviction_counter =
> atomic64_read(&p->adev->num_evictions);
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> index a85e75327456..e429829ae93d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> @@ -322,7 +322,7 @@ int amdgpu_bo_create_restricted(struct amdgpu_device *adev,
> struct amdgpu_bo *bo;
> enum ttm_bo_type type;
> unsigned long page_align;
> - u64 initial_bytes_moved;
> + u64 initial_bytes_moved, bytes_moved;
> size_t acc_size;
> int r;
>
> @@ -398,8 +398,14 @@ int amdgpu_bo_create_restricted(struct amdgpu_device *adev,
> r = ttm_bo_init_reserved(&adev->mman.bdev, &bo->tbo, size, type,
> &bo->placement, page_align, !kernel, NULL,
> acc_size, sg, resv, &amdgpu_ttm_bo_destroy);
> - amdgpu_cs_report_moved_bytes(adev,
> - atomic64_read(&adev->num_bytes_moved) - initial_bytes_moved);
> + bytes_moved = atomic64_read(&adev->num_bytes_moved) -
> + initial_bytes_moved;
> + if (adev->mc.visible_vram_size < adev->mc.real_vram_size &&
> + bo->tbo.mem.mem_type == TTM_PL_VRAM &&
> + bo->tbo.mem.start < adev->mc.visible_vram_size >> PAGE_SHIFT)
> + amdgpu_cs_report_moved_bytes(adev, bytes_moved, bytes_moved);
> + else
> + amdgpu_cs_report_moved_bytes(adev, bytes_moved, 0);
>
> if (unlikely(r != 0))
> return r;
> --
> 2.13.2
>