[PATCH] drm/vc4: Allow using more than 256MB of CMA memory.

Fri Apr 14 08:08:25 UTC 2017

On Mon, 27 Mar 2017 16:10:25 -0700
Eric Anholt <eric at anholt.net> wrote:

> Until now, we've had to limit Raspberry Pi to 256MB of CMA memory to
> keep from triggering the hardware addressing bug between of the tile
> binner of the tile alloc memory (where the top 4 bits come from the
> tile state data array's address).
> 
> To work around that and allow more memory to be reserved for graphics,
> allocate a single BO to store tile state data arrays and tile
> alloc/overflow memory while the GPU is active, and make sure that that
> one BO doesn't happen to cross a 256MB boundary.  With that in place,
> we can allocate textures and shaders anywhere in system memory (still
> contiguous, of course).

It's hard to review something I don't quite understand, but I didn't
spot any problem and the code seems to follow what the commit message
says: make sure the memory used by the tile binner (still have to look
at what a tile binner is :-)) does not cross a 256MB.

So, not sure my review has a real value here, but

Reviewed-by: Boris Brezillon <boris.brezillon at free-electrons.com>

> 
> Signed-off-by: Eric Anholt <eric at anholt.net>
> ---
>  drivers/gpu/drm/vc4/vc4_drv.h       |  28 +++++--
>  drivers/gpu/drm/vc4/vc4_gem.c       |  12 ++-
>  drivers/gpu/drm/vc4/vc4_irq.c       |  61 +++++++--------
>  drivers/gpu/drm/vc4/vc4_render_cl.c |   3 +-
>  drivers/gpu/drm/vc4/vc4_v3d.c       | 150 ++++++++++++++++++++++++++++++++++++
>  drivers/gpu/drm/vc4/vc4_validate.c  |  54 ++++++-------
>  6 files changed, 234 insertions(+), 74 deletions(-)
> 
> diff --git a/drivers/gpu/drm/vc4/vc4_drv.h b/drivers/gpu/drm/vc4/vc4_drv.h
> index dffce6293d87..5d9532163cbe 100644
> --- a/drivers/gpu/drm/vc4/vc4_drv.h
> +++ b/drivers/gpu/drm/vc4/vc4_drv.h
> @@ -95,12 +95,23 @@ struct vc4_dev {
>  	 */
>  	struct list_head seqno_cb_list;
>  
> -	/* The binner overflow memory that's currently set up in
> -	 * BPOA/BPOS registers.  When overflow occurs and a new one is
> -	 * allocated, the previous one will be moved to
> -	 * vc4->current_exec's free list.
> +	/* The memory used for storing binner tile alloc, tile state,
> +	 * and overflow memory allocations.  This is freed when V3D
> +	 * powers down.
>  	 */
> -	struct vc4_bo *overflow_mem;
> +	struct vc4_bo *bin_bo;
> +
> +	/* Size of blocks allocated within bin_bo. */
> +	uint32_t bin_alloc_size;
> +
> +	/* Bitmask of the bin_alloc_size chunks in bin_bo that are
> +	 * used.
> +	 */
> +	uint32_t bin_alloc_used;
> +
> +	/* Bitmask of the current bin_alloc used for overflow memory. */
> +	uint32_t bin_alloc_overflow;
> +
>  	struct work_struct overflow_mem_work;
>  
>  	int power_refcount;
> @@ -293,8 +304,12 @@ struct vc4_exec_info {
>  	bool found_increment_semaphore_packet;
>  	bool found_flush;
>  	uint8_t bin_tiles_x, bin_tiles_y;
> -	struct drm_gem_cma_object *tile_bo;
> +	/* Physical address of the start of the tile alloc array
> +	 * (where each tile's binned CL will start)
> +	 */
>  	uint32_t tile_alloc_offset;
> +	/* Bitmask of which binner slots are freed when this job completes. */
> +	uint32_t bin_slots;
>  
>  	/**
>  	 * Computed addresses pointing into exec_bo where we start the
> @@ -522,6 +537,7 @@ void vc4_plane_async_set_fb(struct drm_plane *plane,
>  extern struct platform_driver vc4_v3d_driver;
>  int vc4_v3d_debugfs_ident(struct seq_file *m, void *unused);
>  int vc4_v3d_debugfs_regs(struct seq_file *m, void *unused);
> +int vc4_v3d_get_bin_slot(struct vc4_dev *vc4);
>  
>  /* vc4_validate.c */
>  int
> diff --git a/drivers/gpu/drm/vc4/vc4_gem.c b/drivers/gpu/drm/vc4/vc4_gem.c
> index e9c381c42139..3ecd1ba7af75 100644
> --- a/drivers/gpu/drm/vc4/vc4_gem.c
> +++ b/drivers/gpu/drm/vc4/vc4_gem.c
> @@ -705,6 +705,7 @@ static void
>  vc4_complete_exec(struct drm_device *dev, struct vc4_exec_info *exec)
>  {
>  	struct vc4_dev *vc4 = to_vc4_dev(dev);
> +	unsigned long irqflags;
>  	unsigned i;
>  
>  	if (exec->bo) {
> @@ -720,6 +721,11 @@ vc4_complete_exec(struct drm_device *dev, struct vc4_exec_info *exec)
>  		drm_gem_object_unreference_unlocked(&bo->base.base);
>  	}
>  
> +	/* Free up the allocation of any bin slots we used. */
> +	spin_lock_irqsave(&vc4->job_lock, irqflags);
> +	vc4->bin_alloc_used &= ~exec->bin_slots;
> +	spin_unlock_irqrestore(&vc4->job_lock, irqflags);
> +
>  	mutex_lock(&vc4->power_lock);
>  	if (--vc4->power_refcount == 0) {
>  		pm_runtime_mark_last_busy(&vc4->v3d->pdev->dev);
> @@ -968,9 +974,9 @@ vc4_gem_destroy(struct drm_device *dev)
>  	/* V3D should already have disabled its interrupt and cleared
>  	 * the overflow allocation registers.  Now free the object.
>  	 */
> -	if (vc4->overflow_mem) {
> -		drm_gem_object_unreference_unlocked(&vc4->overflow_mem->base.base);
> -		vc4->overflow_mem = NULL;
> +	if (vc4->bin_bo) {
> +		drm_gem_object_put_unlocked(&vc4->bin_bo->base.base);
> +		vc4->bin_bo = NULL;
>  	}
>  
>  	if (vc4->hang_state)
> diff --git a/drivers/gpu/drm/vc4/vc4_irq.c b/drivers/gpu/drm/vc4/vc4_irq.c
> index cdc6e6760705..47bbec962f45 100644
> --- a/drivers/gpu/drm/vc4/vc4_irq.c
> +++ b/drivers/gpu/drm/vc4/vc4_irq.c
> @@ -59,50 +59,45 @@ vc4_overflow_mem_work(struct work_struct *work)
>  {
>  	struct vc4_dev *vc4 =
>  		container_of(work, struct vc4_dev, overflow_mem_work);
> -	struct drm_device *dev = vc4->dev;
> -	struct vc4_bo *bo;
> +	struct vc4_bo *bo = vc4->bin_bo;
> +	int bin_bo_slot;
> +	struct vc4_exec_info *exec;
> +	unsigned long irqflags;
>  
> -	bo = vc4_bo_create(dev, 256 * 1024, true);
> -	if (IS_ERR(bo)) {
> +	bin_bo_slot = vc4_v3d_get_bin_slot(vc4);
> +	if (bin_bo_slot < 0) {
>  		DRM_ERROR("Couldn't allocate binner overflow mem\n");
>  		return;
>  	}
>  
> -	/* If there's a job executing currently, then our previous
> -	 * overflow allocation is getting used in that job and we need
> -	 * to queue it to be released when the job is done.  But if no
> -	 * job is executing at all, then we can free the old overflow
> -	 * object direcctly.
> -	 *
> -	 * No lock necessary for this pointer since we're the only
> -	 * ones that update the pointer, and our workqueue won't
> -	 * reenter.
> -	 */
> -	if (vc4->overflow_mem) {
> -		struct vc4_exec_info *current_exec;
> -		unsigned long irqflags;
> -
> -		spin_lock_irqsave(&vc4->job_lock, irqflags);
> -		current_exec = vc4_first_bin_job(vc4);
> -		if (!current_exec)
> -			current_exec = vc4_last_render_job(vc4);
> -		if (current_exec) {
> -			vc4->overflow_mem->seqno = current_exec->seqno;
> -			list_add_tail(&vc4->overflow_mem->unref_head,
> -				      &current_exec->unref_list);
> -			vc4->overflow_mem = NULL;
> +	spin_lock_irqsave(&vc4->job_lock, irqflags);
> +
> +	if (vc4->bin_alloc_overflow) {
> +		/* If we had overflow memory allocated previously,
> +		 * then that chunk will free when the current bin job
> +		 * is done.  If we don't have a bin job running, then
> +		 * the chunk will be done whenever the list of render
> +		 * jobs has drained.
> +		 */
> +		exec = vc4_first_bin_job(vc4);
> +		if (!exec)
> +			exec = vc4_last_render_job(vc4);
> +		if (exec) {
> +			exec->bin_slots |= vc4->bin_alloc_overflow;
> +		} else {
> +			/* There's nothing queued in the hardware, so
> +			 * the old slot is free immediately.
> +			 */
> +			vc4->bin_alloc_used &= ~vc4->bin_alloc_overflow;
>  		}
> -		spin_unlock_irqrestore(&vc4->job_lock, irqflags);
>  	}
> +	vc4->bin_alloc_overflow = BIT(bin_bo_slot);
>  
> -	if (vc4->overflow_mem)
> -		drm_gem_object_unreference_unlocked(&vc4->overflow_mem->base.base);
> -	vc4->overflow_mem = bo;
> -
> -	V3D_WRITE(V3D_BPOA, bo->base.paddr);
> +	V3D_WRITE(V3D_BPOA, bo->base.paddr + bin_bo_slot * vc4->bin_alloc_size);
>  	V3D_WRITE(V3D_BPOS, bo->base.base.size);
>  	V3D_WRITE(V3D_INTCTL, V3D_INT_OUTOMEM);
>  	V3D_WRITE(V3D_INTENA, V3D_INT_OUTOMEM);
> +	spin_unlock_irqrestore(&vc4->job_lock, irqflags);
>  }
>  
>  static void
> diff --git a/drivers/gpu/drm/vc4/vc4_render_cl.c b/drivers/gpu/drm/vc4/vc4_render_cl.c
> index 4339471f517f..5dc19429d4ae 100644
> --- a/drivers/gpu/drm/vc4/vc4_render_cl.c
> +++ b/drivers/gpu/drm/vc4/vc4_render_cl.c
> @@ -182,8 +182,7 @@ static void emit_tile(struct vc4_exec_info *exec,
>  
>  	if (has_bin) {
>  		rcl_u8(setup, VC4_PACKET_BRANCH_TO_SUB_LIST);
> -		rcl_u32(setup, (exec->tile_bo->paddr +
> -				exec->tile_alloc_offset +
> +		rcl_u32(setup, (exec->tile_alloc_offset +
>  				(y * exec->bin_tiles_x + x) * 32));
>  	}
>  
> diff --git a/drivers/gpu/drm/vc4/vc4_v3d.c b/drivers/gpu/drm/vc4/vc4_v3d.c
> index 7cc346ad9b0b..a88078d7c9d1 100644
> --- a/drivers/gpu/drm/vc4/vc4_v3d.c
> +++ b/drivers/gpu/drm/vc4/vc4_v3d.c
> @@ -156,6 +156,144 @@ static void vc4_v3d_init_hw(struct drm_device *dev)
>  	V3D_WRITE(V3D_VPMBASE, 0);
>  }
>  
> +int vc4_v3d_get_bin_slot(struct vc4_dev *vc4)
> +{
> +	struct drm_device *dev = vc4->dev;
> +	unsigned long irqflags;
> +	int slot;
> +	uint64_t seqno = 0;
> +	struct vc4_exec_info *exec;
> +
> +try_again:
> +	spin_lock_irqsave(&vc4->job_lock, irqflags);
> +	slot = ffs(~vc4->bin_alloc_used);
> +	if (slot != 0) {
> +		/* Switch from ffs() bit index to a 0-based index. */
> +		slot--;
> +		vc4->bin_alloc_used |= BIT(slot);
> +		spin_unlock_irqrestore(&vc4->job_lock, irqflags);
> +		return slot;
> +	}
> +
> +	/* Couldn't find an open slot.  Wait for render to complete
> +	 * and try again.
> +	 */
> +	exec = vc4_last_render_job(vc4);
> +	if (exec)
> +		seqno = exec->seqno;
> +	spin_unlock_irqrestore(&vc4->job_lock, irqflags);
> +
> +	if (seqno) {
> +		int ret = vc4_wait_for_seqno(dev, seqno, ~0ull, true);
> +
> +		if (ret == 0)
> +			goto try_again;
> +
> +		return ret;
> +	}
> +
> +	return -ENOMEM;
> +}
> +
> +/**
> + * vc4_allocate_bin_bo() - allocates the memory that will be used for
> + * tile binning.
> + *
> + * The binner has a limitation that the addresses in the tile state
> + * buffer that point into the tile alloc buffer or binner overflow
> + * memory only have 28 bits (256MB), and the top 4 on the bus for
> + * tile alloc references end up coming from the tile state buffer's
> + * address.
> + *
> + * To work around this, we allocate a single large buffer while V3D is
> + * in use, make sure that it has the top 4 bits constant across its
> + * entire extent, and then put the tile state, tile alloc, and binner
> + * overflow memory inside that buffer.
> + *
> + * This creates a limitation where we may not be able to execute a job
> + * if it doesn't fit within the buffer that we allocated up front.
> + * However, it turns out that 16MB is "enough for anybody", and
> + * real-world applications run into allocation failures from the
> + * overall CMA pool before they make scenes complicated enough to run
> + * out of bin space.
> + */
> +int
> +vc4_allocate_bin_bo(struct drm_device *drm)
> +{
> +	struct vc4_dev *vc4 = to_vc4_dev(drm);
> +	struct vc4_v3d *v3d = vc4->v3d;
> +	uint32_t size = 16 * 1024 * 1024;
> +	int ret = 0;
> +	struct list_head list;
> +
> +	/* We may need to try allocating more than once to get a BO
> +	 * that doesn't cross 256MB.  Track the ones we've allocated
> +	 * that failed so far, so that we can free them when we've got
> +	 * one that succeeded (if we freed them right away, our next
> +	 * allocation would probably be the same chunk of memory).
> +	 */
> +	INIT_LIST_HEAD(&list);
> +
> +	while (true) {
> +		struct vc4_bo *bo = vc4_bo_create(drm, size, true);
> +
> +		if (IS_ERR(bo)) {
> +			ret = PTR_ERR(bo);
> +
> +			dev_err(&v3d->pdev->dev,
> +				"Failed to allocate memory for tile binning: "
> +				"%d. You may need to enable CMA or give it "
> +				"more memory.",
> +				ret);
> +			break;
> +		}
> +
> +		/* Check if this BO won't trigger the addressing bug. */
> +		if ((bo->base.paddr & 0xf0000000) ==
> +		    ((bo->base.paddr + bo->base.base.size - 1) & 0xf0000000)) {
> +			vc4->bin_bo = bo;
> +
> +			/* Set up for allocating 512KB chunks of
> +			 * binner memory.  The biggest allocation we
> +			 * need to do is for the initial tile alloc +
> +			 * tile state buffer.  We can render to a
> +			 * maximum of ((2048*2048) / (32*32) = 4096
> +			 * tiles in a frame (until we do floating
> +			 * point rendering, at which point it would be
> +			 * 8192).  Tile state is 48b/tile (rounded to
> +			 * a page), and tile alloc is 32b/tile
> +			 * (rounded to a page), plus a page of extra,
> +			 * for a total of 320kb for our worst-case.
> +			 * We choose 512kb so that it divides evenly
> +			 * into our 16MB, and the rest of the 512kb
> +			 * will be used as storage for the overflow
> +			 * from the initial 32b CL per bin.
> +			 */
> +			vc4->bin_alloc_size = 512 * 1024;
> +			vc4->bin_alloc_used = 0;
> +			vc4->bin_alloc_overflow = 0;
> +			WARN_ON_ONCE(sizeof(vc4->bin_alloc_used) * 8 !=
> +				     bo->base.base.size / vc4->bin_alloc_size);
> +
> +			break;
> +		}
> +
> +		/* Put it on the list to free later, and try again. */
> +		list_add(&bo->unref_head, &list);
> +	}
> +
> +	/* Free all the BOs we allocated but didn't choose. */
> +	while (!list_empty(&list)) {
> +		struct vc4_bo *bo = list_last_entry(&list,
> +						    struct vc4_bo, unref_head);
> +
> +		list_del(&bo->unref_head);
> +		drm_gem_object_put_unlocked(&bo->base.base);
> +	}
> +
> +	return ret;
> +}
> +
>  #ifdef CONFIG_PM
>  static int vc4_v3d_runtime_suspend(struct device *dev)
>  {
> @@ -164,6 +302,9 @@ static int vc4_v3d_runtime_suspend(struct device *dev)
>  
>  	vc4_irq_uninstall(vc4->dev);
>  
> +	drm_gem_object_put_unlocked(&vc4->bin_bo->base.base);
> +	vc4->bin_bo = NULL;
> +
>  	return 0;
>  }
>  
> @@ -171,6 +312,11 @@ static int vc4_v3d_runtime_resume(struct device *dev)
>  {
>  	struct vc4_v3d *v3d = dev_get_drvdata(dev);
>  	struct vc4_dev *vc4 = v3d->vc4;
> +	int ret;
> +
> +	ret = vc4_allocate_bin_bo(vc4->dev);
> +	if (ret)
> +		return ret;
>  
>  	vc4_v3d_init_hw(vc4->dev);
>  	vc4_irq_postinstall(vc4->dev);
> @@ -208,6 +354,10 @@ static int vc4_v3d_bind(struct device *dev, struct device *master, void *data)
>  		return -EINVAL;
>  	}
>  
> +	ret = vc4_allocate_bin_bo(drm);
> +	if (ret)
> +		return ret;
> +
>  	/* Reset the binner overflow address/size at setup, to be sure
>  	 * we don't reuse an old one.
>  	 */
> diff --git a/drivers/gpu/drm/vc4/vc4_validate.c b/drivers/gpu/drm/vc4/vc4_validate.c
> index da6f1e138e8d..3de8f11595c0 100644
> --- a/drivers/gpu/drm/vc4/vc4_validate.c
> +++ b/drivers/gpu/drm/vc4/vc4_validate.c
> @@ -348,10 +348,11 @@ static int
>  validate_tile_binning_config(VALIDATE_ARGS)
>  {
>  	struct drm_device *dev = exec->exec_bo->base.dev;
> -	struct vc4_bo *tile_bo;
> +	struct vc4_dev *vc4 = to_vc4_dev(dev);
>  	uint8_t flags;
> -	uint32_t tile_state_size, tile_alloc_size;
> -	uint32_t tile_count;
> +	uint32_t tile_state_size;
> +	uint32_t tile_count, bin_addr;
> +	int bin_slot;
>  
>  	if (exec->found_tile_binning_mode_config_packet) {
>  		DRM_ERROR("Duplicate VC4_PACKET_TILE_BINNING_MODE_CONFIG\n");
> @@ -377,13 +378,28 @@ validate_tile_binning_config(VALIDATE_ARGS)
>  		return -EINVAL;
>  	}
>  
> +	bin_slot = vc4_v3d_get_bin_slot(vc4);
> +	if (bin_slot < 0) {
> +		if (bin_slot != -EINTR && bin_slot != -ERESTARTSYS) {
> +			DRM_ERROR("Failed to allocate binner memory: %d\n",
> +				  bin_slot);
> +		}
> +		return bin_slot;
> +	}
> +
> +	/* The slot we allocated will only be used by this job, and is
> +	 * free when the job completes rendering.
> +	 */
> +	exec->bin_slots |= BIT(bin_slot);
> +	bin_addr = vc4->bin_bo->base.paddr + bin_slot * vc4->bin_alloc_size;
> +
>  	/* The tile state data array is 48 bytes per tile, and we put it at
>  	 * the start of a BO containing both it and the tile alloc.
>  	 */
>  	tile_state_size = 48 * tile_count;
>  
>  	/* Since the tile alloc array will follow us, align. */
> -	exec->tile_alloc_offset = roundup(tile_state_size, 4096);
> +	exec->tile_alloc_offset = bin_addr + roundup(tile_state_size, 4096);
>  
>  	*(uint8_t *)(validated + 14) =
>  		((flags & ~(VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_MASK |
> @@ -394,35 +410,13 @@ validate_tile_binning_config(VALIDATE_ARGS)
>  		 VC4_SET_FIELD(VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_128,
>  			       VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE));
>  
> -	/* Initial block size. */
> -	tile_alloc_size = 32 * tile_count;
> -
> -	/*
> -	 * The initial allocation gets rounded to the next 256 bytes before
> -	 * the hardware starts fulfilling further allocations.
> -	 */
> -	tile_alloc_size = roundup(tile_alloc_size, 256);
> -
> -	/* Add space for the extra allocations.  This is what gets used first,
> -	 * before overflow memory.  It must have at least 4096 bytes, but we
> -	 * want to avoid overflow memory usage if possible.
> -	 */
> -	tile_alloc_size += 1024 * 1024;
> -
> -	tile_bo = vc4_bo_create(dev, exec->tile_alloc_offset + tile_alloc_size,
> -				true);
> -	exec->tile_bo = &tile_bo->base;
> -	if (IS_ERR(exec->tile_bo))
> -		return PTR_ERR(exec->tile_bo);
> -	list_add_tail(&tile_bo->unref_head, &exec->unref_list);
> -
>  	/* tile alloc address. */
> -	*(uint32_t *)(validated + 0) = (exec->tile_bo->paddr +
> -					exec->tile_alloc_offset);
> +	*(uint32_t *)(validated + 0) = exec->tile_alloc_offset;
>  	/* tile alloc size. */
> -	*(uint32_t *)(validated + 4) = tile_alloc_size;
> +	*(uint32_t *)(validated + 4) = (bin_addr + vc4->bin_alloc_size -
> +					exec->tile_alloc_offset);
>  	/* tile state address. */
> -	*(uint32_t *)(validated + 8) = exec->tile_bo->paddr;
> +	*(uint32_t *)(validated + 8) = bin_addr;
>  
>  	return 0;
>  }