[PATCH] drm/amdgpu: throttle buffer migrations at CS using a fixed MBps limit

Christian König deathsimple at vodafone.de
Wed Aug 24 08:05:37 UTC 2016


> +		struct mutex		lock;
I haven't completely checked the code, but it looked like you just made 
a bunch of calculations while holding the lock (e.g. no sleeping).

So it might be more appropriate to use a spinlock here (less overhead).

Apart from that this looks really good to me, patch is Acked-by: 
Christian König <christian.koenig at amd.com>.

BTW Regarding VRAM fragmentation: Could it be that most align 
restrictions Mesa forwards to the kernel became nonsense with VM support?

I mean for tiling the memory must be aligned in the VM, but not 
physically any more. Isn't that right?

Regards,
Christian.

Am 24.08.2016 um 00:19 schrieb Marek Olšák:
> From: Marek Olšák <marek.olsak at amd.com>
>
> The old mechanism used a per-submission limit that didn't take previous
> submissions within the same time frame into account. It also filled VRAM
> slowly when VRAM usage dropped due to a big eviction or buffer deallocation.
>
> This new method establishes a configurable MBps limit that is obeyed when
> VRAM usage is very high. When VRAM usage is not very high, it gives
> the driver the freedom to fill it quickly. The result is more consistent
> performance.
>
> It can't keep the BO move rate low if lots of evictions are happening due
> to VRAM fragmentation, or if a big buffer is being migrated.
>
> The amdgpu.moverate parameter can be used to set a non-default limit.
> Measurements can be done to find out which amdgpu.moverate setting gives
> the best results.
>
> Mainly APUs and cards with small VRAM will benefit from this. For F1 2015,
> anything with 2 GB VRAM or less will benefit.
>
> Some benchmark results - F1 2015 (Tonga 2GB):
>
> Limit      MinFPS AvgFPS
> Old code:  14     32.6
> 128 MB/s:  28     41
> 64 MB/s:   15.5   43
> 32 MB/s:   28.7   43.4
> 8 MB/s:    27.8   44.4
> 8 MB/s:    21.9   42.8 (different run)
>
> Random drops in Min FPS can still occur (due to fragmented VRAM?), but
> the average FPS is much better. 8 MB/s is probably a good limit for this
> game & the current VRAM management. The random FPS drops are still to be
> tackled.
>
> Signed-off-by: Marek Olšák <marek.olsak at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |   9 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c     | 152 ++++++++++++++++++++---------
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  10 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |   4 +
>   4 files changed, 127 insertions(+), 48 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 916da80..8bde95c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -58,20 +58,21 @@
>   #include "amd_powerplay.h"
>   
>   #include "gpu_scheduler.h"
>   
>   /*
>    * Modules parameters.
>    */
>   extern int amdgpu_modeset;
>   extern int amdgpu_vram_limit;
>   extern int amdgpu_gart_size;
> +extern int amdgpu_moverate;
>   extern int amdgpu_benchmarking;
>   extern int amdgpu_testing;
>   extern int amdgpu_audio;
>   extern int amdgpu_disp_priority;
>   extern int amdgpu_hw_i2c;
>   extern int amdgpu_pcie_gen2;
>   extern int amdgpu_msi;
>   extern int amdgpu_lockup_timeout;
>   extern int amdgpu_dpm;
>   extern int amdgpu_smc_load_fw;
> @@ -2029,20 +2030,28 @@ struct amdgpu_device {
>   	struct amdgpu_mman		mman;
>   	struct amdgpu_vram_scratch	vram_scratch;
>   	struct amdgpu_wb		wb;
>   	atomic64_t			vram_usage;
>   	atomic64_t			vram_vis_usage;
>   	atomic64_t			gtt_usage;
>   	atomic64_t			num_bytes_moved;
>   	atomic64_t			num_evictions;
>   	atomic_t			gpu_reset_counter;
>   
> +	/* data for buffer migration throttling */
> +	struct {
> +		struct mutex		lock;
> +		s64			last_update_us;
> +		s64			accum_us; /* accumulated microseconds */
> +		u32			log2_max_MBps;
> +	} mm_stats;
> +
>   	/* display */
>   	bool				enable_virtual_display;
>   	struct amdgpu_mode_info		mode_info;
>   	/* For pre-DCE11. DCE11 and later are in "struct amdgpu_device->dm" */
>   	struct work_struct		hotplug_work;
>   	struct amdgpu_irq_src		crtc_irq;
>   	struct amdgpu_irq_src		pageflip_irq;
>   	struct amdgpu_irq_src		hpd_irq;
>   
>   	/* rings */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index d5d61a7..c6b4946 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -229,91 +229,145 @@ free_partial_kdata:
>   		drm_free_large(p->chunks[i].kdata);
>   	kfree(p->chunks);
>   put_ctx:
>   	amdgpu_ctx_put(p->ctx);
>   free_chunk:
>   	kfree(chunk_array);
>   
>   	return ret;
>   }
>   
> -/* Returns how many bytes TTM can move per IB.
> +/* Convert microseconds to bytes. */
> +static u64 us_to_bytes(struct amdgpu_device *adev, s64 us)
> +{
> +	if (us <= 0 || !adev->mm_stats.log2_max_MBps)
> +		return 0;
> +
> +	/* Since accum_us is incremented by a million per second, just
> +	 * multiply it by the number of MB/s to get the number of bytes.
> +	 */
> +	return us << adev->mm_stats.log2_max_MBps;
> +}
> +
> +static s64 bytes_to_us(struct amdgpu_device *adev, u64 bytes)
> +{
> +	if (!adev->mm_stats.log2_max_MBps)
> +		return 0;
> +
> +	return bytes >> adev->mm_stats.log2_max_MBps;
> +}
> +
> +/* Returns how many bytes TTM can move right now. If no bytes can be moved,
> + * it returns 0. If it returns non-zero, it's OK to move at least one buffer,
> + * which means it can go over the threshold once. If that happens, the driver
> + * will be in debt and no other buffer migrations can be done until that debt
> + * is repaid.
> + *
> + * This approach allows moving a buffer of any size (it's important to allow
> + * that).
> + *
> + * The currency is simply time in microseconds and it increases as the clock
> + * ticks. The accumulated microseconds (us) are converted to bytes and
> + * returned.
>    */
>   static u64 amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev)
>   {
> -	u64 real_vram_size = adev->mc.real_vram_size;
> -	u64 vram_usage = atomic64_read(&adev->vram_usage);
> +	s64 time_us, increment_us;
> +	u64 max_bytes;
> +	u64 free_vram, total_vram, used_vram;
>   
> -	/* This function is based on the current VRAM usage.
> +	/* Allow a maximum of 200 accumulated ms. This is basically per-IB
> +	 * throttling.
>   	 *
> -	 * - If all of VRAM is free, allow relocating the number of bytes that
> -	 *   is equal to 1/4 of the size of VRAM for this IB.
> +	 * It means that in order to get full max MBps, at least 5 IBs per
> +	 * second must be submitted and not more than 200ms apart from each
> +	 * other.
> +	 */
> +	const s64 us_upper_bound = 200000;
>   
> -	 * - If more than one half of VRAM is occupied, only allow relocating
> -	 *   1 MB of data for this IB.
> -	 *
> -	 * - From 0 to one half of used VRAM, the threshold decreases
> -	 *   linearly.
> -	 *         __________________
> -	 * 1/4 of -|\               |
> -	 * VRAM    | \              |
> -	 *         |  \             |
> -	 *         |   \            |
> -	 *         |    \           |
> -	 *         |     \          |
> -	 *         |      \         |
> -	 *         |       \________|1 MB
> -	 *         |----------------|
> -	 *    VRAM 0 %             100 %
> -	 *         used            used
> -	 *
> -	 * Note: It's a threshold, not a limit. The threshold must be crossed
> -	 * for buffer relocations to stop, so any buffer of an arbitrary size
> -	 * can be moved as long as the threshold isn't crossed before
> -	 * the relocation takes place. We don't want to disable buffer
> -	 * relocations completely.
> +	if (!adev->mm_stats.log2_max_MBps)
> +		return 0;
> +
> +	total_vram = adev->mc.real_vram_size - adev->vram_pin_size;
> +	used_vram = atomic64_read(&adev->vram_usage);
> +	free_vram = used_vram >= total_vram ? 0 : total_vram - used_vram;
> +
> +	mutex_lock(&adev->mm_stats.lock);
> +
> +	/* Increase the amount of accumulated us. */
> +	time_us = ktime_to_us(ktime_get());
> +	increment_us = time_us - adev->mm_stats.last_update_us;
> +	adev->mm_stats.last_update_us = time_us;
> +	adev->mm_stats.accum_us = min(adev->mm_stats.accum_us + increment_us,
> +                                      us_upper_bound);
> +
> +	/* This prevents the short period of low performance when the VRAM
> +	 * usage is low and the driver is in debt or doesn't have enough
> +	 * accumulated us to fill VRAM quickly.
>   	 *
> -	 * The idea is that buffers should be placed in VRAM at creation time
> -	 * and TTM should only do a minimum number of relocations during
> -	 * command submission. In practice, you need to submit at least
> -	 * a dozen IBs to move all buffers to VRAM if they are in GTT.
> +	 * The situation can occur in these cases:
> +	 * - a lot of VRAM is freed by userspace
> +	 * - the presence of a big buffer causes a lot of evictions
> +	 *   (solution: split buffers into smaller ones)
>   	 *
> -	 * Also, things can get pretty crazy under memory pressure and actual
> -	 * VRAM usage can change a lot, so playing safe even at 50% does
> -	 * consistently increase performance.
> +	 * If 128 MB or 1/8th of VRAM is free, start filling it now by setting
> +	 * accum_us to a positive number.
>   	 */
> +	if (free_vram >= 128 * 1024 * 1024 || free_vram >= total_vram / 8) {
> +		s64 min_us;
> +
> +		/* Be more aggressive on dGPUs. Try to fill a portion of free
> +		 * VRAM now.
> +		 */
> +		if (!(adev->flags & AMD_IS_APU))
> +			min_us = bytes_to_us(adev, free_vram / 4);
> +		else
> +			min_us = 0; /* Reset accum_us on APUs. */
> +
> +		adev->mm_stats.accum_us = max(min_us, adev->mm_stats.accum_us);
> +	}
>   
> -	u64 half_vram = real_vram_size >> 1;
> -	u64 half_free_vram = vram_usage >= half_vram ? 0 : half_vram - vram_usage;
> -	u64 bytes_moved_threshold = half_free_vram >> 1;
> -	return max(bytes_moved_threshold, 1024*1024ull);
> +	/* This returns 0 if the driver is in debt to disallow (optional)
> +	 * buffer moves.
> +	 */
> +	max_bytes = us_to_bytes(adev, adev->mm_stats.accum_us);
> +
> +	mutex_unlock(&adev->mm_stats.lock);
> +	return max_bytes;
> +}
> +
> +/* Report how many bytes have really been moved for the last command
> + * submission. This can result in a debt that can stop buffer migrations
> + * temporarily.
> + */
> +static void amdgpu_cs_report_moved_bytes(struct amdgpu_device *adev,
> +					 u64 num_bytes)
> +{
> +	mutex_lock(&adev->mm_stats.lock);
> +	adev->mm_stats.accum_us -= bytes_to_us(adev, num_bytes);
> +	mutex_unlock(&adev->mm_stats.lock);
>   }
>   
>   static int amdgpu_cs_bo_validate(struct amdgpu_cs_parser *p,
>   				 struct amdgpu_bo *bo)
>   {
>   	u64 initial_bytes_moved;
>   	uint32_t domain;
>   	int r;
>   
>   	if (bo->pin_count)
>   		return 0;
>   
> -	/* Avoid moving this one if we have moved too many buffers
> -	 * for this IB already.
> -	 *
> -	 * Note that this allows moving at least one buffer of
> -	 * any size, because it doesn't take the current "bo"
> -	 * into account. We don't want to disallow buffer moves
> -	 * completely.
> +	/* Don't move this buffer if we have depleted our allowance
> +	 * to move it. Don't move anything if the threshold is zero.
>   	 */
> -	if (p->bytes_moved <= p->bytes_moved_threshold)
> +	if (p->bytes_moved < p->bytes_moved_threshold)
>   		domain = bo->prefered_domains;
>   	else
>   		domain = bo->allowed_domains;
>   
>   retry:
>   	amdgpu_ttm_placement_from_domain(bo, domain);
>   	initial_bytes_moved = atomic64_read(&bo->adev->num_bytes_moved);
>   	r = ttm_bo_validate(&bo->tbo, &bo->placement, true, false);
>   	p->bytes_moved += atomic64_read(&bo->adev->num_bytes_moved) -
>   		initial_bytes_moved;
> @@ -488,20 +542,22 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
>   		DRM_ERROR("amdgpu_cs_list_validate(duplicates) failed.\n");
>   		goto error_validate;
>   	}
>   
>   	r = amdgpu_cs_list_validate(p, &p->validated);
>   	if (r) {
>   		DRM_ERROR("amdgpu_cs_list_validate(validated) failed.\n");
>   		goto error_validate;
>   	}
>   
> +	amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved);
> +
>   	fpriv->vm.last_eviction_counter =
>   		atomic64_read(&p->adev->num_evictions);
>   
>   	if (p->bo_list) {
>   		struct amdgpu_bo *gds = p->bo_list->gds_obj;
>   		struct amdgpu_bo *gws = p->bo_list->gws_obj;
>   		struct amdgpu_bo *oa = p->bo_list->oa_obj;
>   		struct amdgpu_vm *vm = &fpriv->vm;
>   		unsigned i;
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 2608138..9eae198 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -1521,20 +1521,21 @@ static bool amdgpu_device_is_virtual(void)
>    * Returns 0 for success or an error on failure.
>    * Called at driver startup.
>    */
>   int amdgpu_device_init(struct amdgpu_device *adev,
>   		       struct drm_device *ddev,
>   		       struct pci_dev *pdev,
>   		       uint32_t flags)
>   {
>   	int r, i;
>   	bool runtime = false;
> +	u32 max_MBps;
>   
>   	adev->shutdown = false;
>   	adev->dev = &pdev->dev;
>   	adev->ddev = ddev;
>   	adev->pdev = pdev;
>   	adev->flags = flags;
>   	adev->asic_type = flags & AMD_ASIC_MASK;
>   	adev->is_atom_bios = false;
>   	adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
>   	adev->mc.gtt_size = 512 * 1024 * 1024;
> @@ -1566,20 +1567,21 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>   		 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
>   
>   	/* mutex initialization are all done here so we
>   	 * can recall function without having locking issues */
>   	mutex_init(&adev->vm_manager.lock);
>   	atomic_set(&adev->irq.ih.lock, 0);
>   	mutex_init(&adev->pm.mutex);
>   	mutex_init(&adev->gfx.gpu_clock_mutex);
>   	mutex_init(&adev->srbm_mutex);
>   	mutex_init(&adev->grbm_idx_mutex);
> +	mutex_init(&adev->mm_stats.lock);
>   	mutex_init(&adev->mn_lock);
>   	hash_init(adev->mn_hash);
>   
>   	amdgpu_check_arguments(adev);
>   
>   	/* Registers mapping */
>   	/* TODO: block userspace mapping of io register */
>   	spin_lock_init(&adev->mmio_idx_lock);
>   	spin_lock_init(&adev->smc_idx_lock);
>   	spin_lock_init(&adev->pcie_idx_lock);
> @@ -1693,20 +1695,28 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>   
>   	r = amdgpu_init(adev);
>   	if (r) {
>   		dev_err(adev->dev, "amdgpu_init failed\n");
>   		amdgpu_fini(adev);
>   		goto failed;
>   	}
>   
>   	adev->accel_working = true;
>   
> +	/* Initialize the buffer migration limit. */
> +	if (amdgpu_moverate >= 0)
> +		max_MBps = amdgpu_moverate;
> +	else
> +		max_MBps = 8; /* Allow 8 MB/s. */
> +	/* Get a log2 for easy divisions. */
> +	adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
> +
>   	amdgpu_fbdev_init(adev);
>   
>   	r = amdgpu_ib_pool_init(adev);
>   	if (r) {
>   		dev_err(adev->dev, "IB initialization failed (%d).\n", r);
>   		goto failed;
>   	}
>   
>   	r = amdgpu_ib_ring_tests(adev);
>   	if (r)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 5bcfeed..fe2d26b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -54,20 +54,21 @@
>    *           at the end of IBs.
>    * - 3.3.0 - Add VM support for UVD on supported hardware.
>    * - 3.4.0 - Add AMDGPU_INFO_NUM_EVICTIONS.
>    */
>   #define KMS_DRIVER_MAJOR	3
>   #define KMS_DRIVER_MINOR	4
>   #define KMS_DRIVER_PATCHLEVEL	0
>   
>   int amdgpu_vram_limit = 0;
>   int amdgpu_gart_size = -1; /* auto */
> +int amdgpu_moverate = -1; /* auto */
>   int amdgpu_benchmarking = 0;
>   int amdgpu_testing = 0;
>   int amdgpu_audio = -1;
>   int amdgpu_disp_priority = 0;
>   int amdgpu_hw_i2c = 0;
>   int amdgpu_pcie_gen2 = -1;
>   int amdgpu_msi = -1;
>   int amdgpu_lockup_timeout = 0;
>   int amdgpu_dpm = -1;
>   int amdgpu_smc_load_fw = 1;
> @@ -92,20 +93,23 @@ unsigned amdgpu_pcie_lane_cap = 0;
>   unsigned amdgpu_cg_mask = 0xffffffff;
>   unsigned amdgpu_pg_mask = 0xffffffff;
>   char *amdgpu_virtual_display = NULL;
>   
>   MODULE_PARM_DESC(vramlimit, "Restrict VRAM for testing, in megabytes");
>   module_param_named(vramlimit, amdgpu_vram_limit, int, 0600);
>   
>   MODULE_PARM_DESC(gartsize, "Size of PCIE/IGP gart to setup in megabytes (32, 64, etc., -1 = auto)");
>   module_param_named(gartsize, amdgpu_gart_size, int, 0600);
>   
> +MODULE_PARM_DESC(moverate, "Maximum buffer migration rate in MB/s. (32, 64, etc., -1=auto, 0=1=disabled)");
> +module_param_named(moverate, amdgpu_moverate, int, 0600);
> +
>   MODULE_PARM_DESC(benchmark, "Run benchmark");
>   module_param_named(benchmark, amdgpu_benchmarking, int, 0444);
>   
>   MODULE_PARM_DESC(test, "Run tests");
>   module_param_named(test, amdgpu_testing, int, 0444);
>   
>   MODULE_PARM_DESC(audio, "Audio enable (-1 = auto, 0 = disable, 1 = enable)");
>   module_param_named(audio, amdgpu_audio, int, 0444);
>   
>   MODULE_PARM_DESC(disp_priority, "Display Priority (0 = auto, 1 = normal, 2 = high)");




More information about the amd-gfx mailing list