[drm-next] drm/radeon: use IBs for VM page table updates

Christian König deathsimple at vodafone.de
Fri Feb 1 02:25:19 PST 2013


Am 31.01.2013 22:37, schrieb alexdeucher at gmail.com:
> From: Alex Deucher <alexander.deucher at amd.com>
>
> For very large page table updates, we can exceed the
> size of the ring.  To avoid this, use an IB to perform
> the page table update.
>
> Signed-off-by: Alex Deucher <alexander.deucher at amd.com>
> ---
>
> We may want to do something similar for the BO move code as we
> could potentially run out of ring space on a small ring like the
> DMA rings if we are moving very large buffers.
>
> Alex
>
>   drivers/gpu/drm/radeon/ni.c          |   30 +++++++++++--------
>   drivers/gpu/drm/radeon/radeon.h      |    6 ++-
>   drivers/gpu/drm/radeon/radeon_asic.h |    8 ++++-
>   drivers/gpu/drm/radeon/radeon_gart.c |   45 +++++++++++++++++++++++------
>   drivers/gpu/drm/radeon/si.c          |   52 ++++++++++++++++++---------------
>   5 files changed, 90 insertions(+), 51 deletions(-)
>
> diff --git a/drivers/gpu/drm/radeon/ni.c b/drivers/gpu/drm/radeon/ni.c
> index 170bd03..7cead76 100644
> --- a/drivers/gpu/drm/radeon/ni.c
> +++ b/drivers/gpu/drm/radeon/ni.c
> @@ -1946,19 +1946,21 @@ uint32_t cayman_vm_page_flags(struct radeon_device *rdev, uint32_t flags)
>    * cayman_vm_set_page - update the page tables using the CP
>    *
>    * @rdev: radeon_device pointer
> + * @ib: indirect buffer to fill with commands
>    * @pe: addr of the page entry
>    * @addr: dst addr to write into pe
>    * @count: number of page entries to update
>    * @incr: increase next addr by incr bytes
>    * @flags: access flags
>    *
> - * Update the page tables using the CP (cayman-si).
> + * Update the page tables using the CP (cayman/TN).
>    */
> -void cayman_vm_set_page(struct radeon_device *rdev, uint64_t pe,
> +void cayman_vm_set_page(struct radeon_device *rdev,
> +			struct radeon_ib *ib,
> +			uint64_t pe,
>   			uint64_t addr, unsigned count,
>   			uint32_t incr, uint32_t flags)
>   {
> -	struct radeon_ring *ring = &rdev->ring[rdev->asic->vm.pt_ring_index];
>   	uint32_t r600_flags = cayman_vm_page_flags(rdev, flags);
>   	uint64_t value;
>   	unsigned ndw;
> @@ -1969,9 +1971,9 @@ void cayman_vm_set_page(struct radeon_device *rdev, uint64_t pe,
>   			if (ndw > 0x3FFF)
>   				ndw = 0x3FFF;
>   
> -			radeon_ring_write(ring, PACKET3(PACKET3_ME_WRITE, ndw));
> -			radeon_ring_write(ring, pe);
> -			radeon_ring_write(ring, upper_32_bits(pe) & 0xff);
> +			ib->ptr[ib->length_dw++] = PACKET3(PACKET3_ME_WRITE, ndw);
> +			ib->ptr[ib->length_dw++] = pe;
> +			ib->ptr[ib->length_dw++] = upper_32_bits(pe) & 0xff;
>   			for (; ndw > 1; ndw -= 2, --count, pe += 8) {
>   				if (flags & RADEON_VM_PAGE_SYSTEM) {
>   					value = radeon_vm_map_gart(rdev, addr);
> @@ -1983,8 +1985,8 @@ void cayman_vm_set_page(struct radeon_device *rdev, uint64_t pe,
>   				}
>   				addr += incr;
>   				value |= r600_flags;
> -				radeon_ring_write(ring, value);
> -				radeon_ring_write(ring, upper_32_bits(value));
> +				ib->ptr[ib->length_dw++] = value;
> +				ib->ptr[ib->length_dw++] = upper_32_bits(value);
>   			}
>   		}
>   	} else {
> @@ -1994,9 +1996,9 @@ void cayman_vm_set_page(struct radeon_device *rdev, uint64_t pe,
>   				ndw = 0xFFFFE;
>   
>   			/* for non-physically contiguous pages (system) */
> -			radeon_ring_write(ring, DMA_PACKET(DMA_PACKET_WRITE, 0, 0, ndw));
> -			radeon_ring_write(ring, pe);
> -			radeon_ring_write(ring, upper_32_bits(pe) & 0xff);
> +			ib->ptr[ib->length_dw++] = DMA_PACKET(DMA_PACKET_WRITE, 0, 0, ndw);
> +			ib->ptr[ib->length_dw++] = pe;
> +			ib->ptr[ib->length_dw++] = upper_32_bits(pe) & 0xff;
>   			for (; ndw > 0; ndw -= 2, --count, pe += 8) {
>   				if (flags & RADEON_VM_PAGE_SYSTEM) {
>   					value = radeon_vm_map_gart(rdev, addr);
> @@ -2008,10 +2010,12 @@ void cayman_vm_set_page(struct radeon_device *rdev, uint64_t pe,
>   				}
>   				addr += incr;
>   				value |= r600_flags;
> -				radeon_ring_write(ring, value);
> -				radeon_ring_write(ring, upper_32_bits(value));
> +				ib->ptr[ib->length_dw++] = value;
> +				ib->ptr[ib->length_dw++] = upper_32_bits(value);
>   			}
>   		}
> +		while (ib->length_dw & 0x7)
> +			ib->ptr[ib->length_dw++] = DMA_PACKET(DMA_PACKET_NOP, 0, 0, 0);
>   	}
>   }
>   
> diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
> index 6539d6c..89b9645 100644
> --- a/drivers/gpu/drm/radeon/radeon.h
> +++ b/drivers/gpu/drm/radeon/radeon.h
> @@ -1188,7 +1188,9 @@ struct radeon_asic {
>   		void (*fini)(struct radeon_device *rdev);
>   
>   		u32 pt_ring_index;
> -		void (*set_page)(struct radeon_device *rdev, uint64_t pe,
> +		void (*set_page)(struct radeon_device *rdev,
> +				 struct radeon_ib *ib,
> +				 uint64_t pe,
>   				 uint64_t addr, unsigned count,
>   				 uint32_t incr, uint32_t flags);
>   	} vm;
> @@ -1810,7 +1812,7 @@ void radeon_ring_write(struct radeon_ring *ring, uint32_t v);
>   #define radeon_gart_set_page(rdev, i, p) (rdev)->asic->gart.set_page((rdev), (i), (p))
>   #define radeon_asic_vm_init(rdev) (rdev)->asic->vm.init((rdev))
>   #define radeon_asic_vm_fini(rdev) (rdev)->asic->vm.fini((rdev))
> -#define radeon_asic_vm_set_page(rdev, pe, addr, count, incr, flags) ((rdev)->asic->vm.set_page((rdev), (pe), (addr), (count), (incr), (flags)))
> +#define radeon_asic_vm_set_page(rdev, ib, pe, addr, count, incr, flags) ((rdev)->asic->vm.set_page((rdev), (ib), (pe), (addr), (count), (incr), (flags)))
>   #define radeon_ring_start(rdev, r, cp) (rdev)->asic->ring[(r)].ring_start((rdev), (cp))
>   #define radeon_ring_test(rdev, r, cp) (rdev)->asic->ring[(r)].ring_test((rdev), (cp))
>   #define radeon_ib_test(rdev, r, cp) (rdev)->asic->ring[(r)].ib_test((rdev), (cp))
> diff --git a/drivers/gpu/drm/radeon/radeon_asic.h b/drivers/gpu/drm/radeon/radeon_asic.h
> index e429e25..f4134a8 100644
> --- a/drivers/gpu/drm/radeon/radeon_asic.h
> +++ b/drivers/gpu/drm/radeon/radeon_asic.h
> @@ -474,7 +474,9 @@ int cayman_vm_init(struct radeon_device *rdev);
>   void cayman_vm_fini(struct radeon_device *rdev);
>   void cayman_vm_flush(struct radeon_device *rdev, int ridx, struct radeon_vm *vm);
>   uint32_t cayman_vm_page_flags(struct radeon_device *rdev, uint32_t flags);
> -void cayman_vm_set_page(struct radeon_device *rdev, uint64_t pe,
> +void cayman_vm_set_page(struct radeon_device *rdev,
> +			struct radeon_ib *ib,
> +			uint64_t pe,
>   			uint64_t addr, unsigned count,
>   			uint32_t incr, uint32_t flags);
>   int evergreen_ib_parse(struct radeon_device *rdev, struct radeon_ib *ib);
> @@ -506,7 +508,9 @@ int si_irq_set(struct radeon_device *rdev);
>   int si_irq_process(struct radeon_device *rdev);
>   int si_vm_init(struct radeon_device *rdev);
>   void si_vm_fini(struct radeon_device *rdev);
> -void si_vm_set_page(struct radeon_device *rdev, uint64_t pe,
> +void si_vm_set_page(struct radeon_device *rdev,
> +		    struct radeon_ib *ib,
> +		    uint64_t pe,
>   		    uint64_t addr, unsigned count,
>   		    uint32_t incr, uint32_t flags);
>   void si_vm_flush(struct radeon_device *rdev, int ridx, struct radeon_vm *vm);
> diff --git a/drivers/gpu/drm/radeon/radeon_gart.c b/drivers/gpu/drm/radeon/radeon_gart.c
> index 6e24f84..d95c61d 100644
> --- a/drivers/gpu/drm/radeon/radeon_gart.c
> +++ b/drivers/gpu/drm/radeon/radeon_gart.c
> @@ -929,6 +929,7 @@ uint64_t radeon_vm_map_gart(struct radeon_device *rdev, uint64_t addr)
>    */
>   static int radeon_vm_update_pdes(struct radeon_device *rdev,
>   				 struct radeon_vm *vm,
> +				 struct radeon_ib *ib,
>   				 uint64_t start, uint64_t end)
>   {
>   	static const uint32_t incr = RADEON_VM_PTE_COUNT * 8;
> @@ -971,7 +972,7 @@ retry:
>   		    ((last_pt + incr * count) != pt)) {
>   
>   			if (count) {
> -				radeon_asic_vm_set_page(rdev, last_pde,
> +				radeon_asic_vm_set_page(rdev, ib, last_pde,
>   							last_pt, count, incr,
>   							RADEON_VM_PAGE_VALID);
>   			}
> @@ -985,7 +986,7 @@ retry:
>   	}
>   
>   	if (count) {
> -		radeon_asic_vm_set_page(rdev, last_pde, last_pt, count,
> +		radeon_asic_vm_set_page(rdev, ib, last_pde, last_pt, count,
>   					incr, RADEON_VM_PAGE_VALID);
>   
>   	}
> @@ -1009,6 +1010,7 @@ retry:
>    */
>   static void radeon_vm_update_ptes(struct radeon_device *rdev,
>   				  struct radeon_vm *vm,
> +				  struct radeon_ib *ib,
>   				  uint64_t start, uint64_t end,
>   				  uint64_t dst, uint32_t flags)
>   {
> @@ -1038,7 +1040,7 @@ static void radeon_vm_update_ptes(struct radeon_device *rdev,
>   		if ((last_pte + 8 * count) != pte) {
>   
>   			if (count) {
> -				radeon_asic_vm_set_page(rdev, last_pte,
> +				radeon_asic_vm_set_page(rdev, ib, last_pte,
>   							last_dst, count,
>   							RADEON_GPU_PAGE_SIZE,
>   							flags);
> @@ -1056,7 +1058,8 @@ static void radeon_vm_update_ptes(struct radeon_device *rdev,
>   	}
>   
>   	if (count) {
> -		radeon_asic_vm_set_page(rdev, last_pte,	last_dst, count,
> +		radeon_asic_vm_set_page(rdev, ib, last_pte,
> +					last_dst, count,
>   					RADEON_GPU_PAGE_SIZE, flags);
>   	}
>   }
> @@ -1082,6 +1085,7 @@ int radeon_vm_bo_update_pte(struct radeon_device *rdev,
>   	unsigned ridx = rdev->asic->vm.pt_ring_index;
>   	struct radeon_ring *ring = &rdev->ring[ridx];
>   	struct radeon_semaphore *sem = NULL;
> +	struct radeon_ib ib;
>   	struct radeon_bo_va *bo_va;
>   	unsigned nptes, npdes, ndw;
>   	uint64_t addr;
> @@ -1140,9 +1144,8 @@ int radeon_vm_bo_update_pte(struct radeon_device *rdev,
>   	/* assume two extra pdes in case the mapping overlaps the borders */
>   	npdes = (nptes >> RADEON_VM_BLOCK_SIZE) + 2;
>   
> -	/* estimate number of dw needed */
> -	/* semaphore, fence and padding */
> -	ndw = 32;
> +	/* padding, etc. */
> +	ndw = 64;
>   
>   	if (RADEON_VM_BLOCK_SIZE > 11)
>   		/* reserve space for one header for every 2k dwords */
> @@ -1161,8 +1164,25 @@ int radeon_vm_bo_update_pte(struct radeon_device *rdev,
>   	/* reserve space for pde addresses */
>   	ndw += npdes * 2;
>   
> -	r = radeon_ring_lock(rdev, ring, ndw);
> +	/* update too big for an IB */
> +	if (ndw > 0xfffff)
> +		return -ENOMEM;
> +
> +	r = radeon_sa_bo_new(rdev, &rdev->ring_tmp_bo, &ib.sa_bo, ndw * 4, 256, true);
> +	if (r) {
> +		return r;
> +	}
> +	ib.ring = ridx;
> +	ib.fence = NULL;
> +	ib.ptr = radeon_sa_bo_cpu_addr(ib.sa_bo);
> +	ib.vm = NULL;
> +	ib.gpu_addr = radeon_sa_bo_gpu_addr(ib.sa_bo);
> +	ib.is_const_ib = false;
> +	ib.length_dw = 0;

Wouldn't it make sense to use the IB functions (ib_get, ib_schedule, 
ib_free) here instead of filling the IB structure manually?

Apart from that it looks quite good.

Christian.

> +
> +	r = radeon_ring_lock(rdev, ring, 64);
>   	if (r) {
> +		radeon_sa_bo_free(rdev, &ib.sa_bo, NULL);
>   		return r;
>   	}
>   
> @@ -1171,23 +1191,28 @@ int radeon_vm_bo_update_pte(struct radeon_device *rdev,
>   		radeon_fence_note_sync(vm->fence, ridx);
>   	}
>   
> -	r = radeon_vm_update_pdes(rdev, vm, bo_va->soffset, bo_va->eoffset);
> +	r = radeon_vm_update_pdes(rdev, vm, &ib, bo_va->soffset, bo_va->eoffset);
>   	if (r) {
> +		radeon_sa_bo_free(rdev, &ib.sa_bo, NULL);
>   		radeon_ring_unlock_undo(rdev, ring);
>   		return r;
>   	}
>   
> -	radeon_vm_update_ptes(rdev, vm, bo_va->soffset, bo_va->eoffset,
> +	radeon_vm_update_ptes(rdev, vm, &ib, bo_va->soffset, bo_va->eoffset,
>   			      addr, bo_va->flags);
>   
> +	radeon_ring_ib_execute(rdev, ib.ring, &ib);
> +
>   	radeon_fence_unref(&vm->fence);
>   	r = radeon_fence_emit(rdev, &vm->fence, ridx);
>   	if (r) {
> +		radeon_sa_bo_free(rdev, &ib.sa_bo, NULL);
>   		radeon_ring_unlock_undo(rdev, ring);
>   		return r;
>   	}
>   	radeon_ring_unlock_commit(rdev, ring);
>   	radeon_semaphore_free(rdev, &sem, vm->fence);
> +	radeon_sa_bo_free(rdev, &ib.sa_bo, vm->fence);
>   	radeon_fence_unref(&vm->last_flush);
>   
>   	return 0;
> diff --git a/drivers/gpu/drm/radeon/si.c b/drivers/gpu/drm/radeon/si.c
> index cd83bc5..a910cb9 100644
> --- a/drivers/gpu/drm/radeon/si.c
> +++ b/drivers/gpu/drm/radeon/si.c
> @@ -3043,19 +3043,21 @@ void si_vm_fini(struct radeon_device *rdev)
>    * si_vm_set_page - update the page tables using the CP
>    *
>    * @rdev: radeon_device pointer
> + * @ib: indirect buffer to fill with commands
>    * @pe: addr of the page entry
>    * @addr: dst addr to write into pe
>    * @count: number of page entries to update
>    * @incr: increase next addr by incr bytes
>    * @flags: access flags
>    *
> - * Update the page tables using the CP (cayman-si).
> + * Update the page tables using the CP (SI).
>    */
> -void si_vm_set_page(struct radeon_device *rdev, uint64_t pe,
> +void si_vm_set_page(struct radeon_device *rdev,
> +		    struct radeon_ib *ib,
> +		    uint64_t pe,
>   		    uint64_t addr, unsigned count,
>   		    uint32_t incr, uint32_t flags)
>   {
> -	struct radeon_ring *ring = &rdev->ring[rdev->asic->vm.pt_ring_index];
>   	uint32_t r600_flags = cayman_vm_page_flags(rdev, flags);
>   	uint64_t value;
>   	unsigned ndw;
> @@ -3066,11 +3068,11 @@ void si_vm_set_page(struct radeon_device *rdev, uint64_t pe,
>   			if (ndw > 0x3FFE)
>   				ndw = 0x3FFE;
>   
> -			radeon_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, ndw));
> -			radeon_ring_write(ring, (WRITE_DATA_ENGINE_SEL(0) |
> -						 WRITE_DATA_DST_SEL(1)));
> -			radeon_ring_write(ring, pe);
> -			radeon_ring_write(ring, upper_32_bits(pe));
> +			ib->ptr[ib->length_dw++] = PACKET3(PACKET3_WRITE_DATA, ndw);
> +			ib->ptr[ib->length_dw++] = (WRITE_DATA_ENGINE_SEL(0) |
> +					WRITE_DATA_DST_SEL(1));
> +			ib->ptr[ib->length_dw++] = pe;
> +			ib->ptr[ib->length_dw++] = upper_32_bits(pe);
>   			for (; ndw > 2; ndw -= 2, --count, pe += 8) {
>   				if (flags & RADEON_VM_PAGE_SYSTEM) {
>   					value = radeon_vm_map_gart(rdev, addr);
> @@ -3082,8 +3084,8 @@ void si_vm_set_page(struct radeon_device *rdev, uint64_t pe,
>   				}
>   				addr += incr;
>   				value |= r600_flags;
> -				radeon_ring_write(ring, value);
> -				radeon_ring_write(ring, upper_32_bits(value));
> +				ib->ptr[ib->length_dw++] = value;
> +				ib->ptr[ib->length_dw++] = upper_32_bits(value);
>   			}
>   		}
>   	} else {
> @@ -3095,9 +3097,9 @@ void si_vm_set_page(struct radeon_device *rdev, uint64_t pe,
>   					ndw = 0xFFFFE;
>   
>   				/* for non-physically contiguous pages (system) */
> -				radeon_ring_write(ring, DMA_PACKET(DMA_PACKET_WRITE, 0, 0, 0, ndw));
> -				radeon_ring_write(ring, pe);
> -				radeon_ring_write(ring, upper_32_bits(pe) & 0xff);
> +				ib->ptr[ib->length_dw++] = DMA_PACKET(DMA_PACKET_WRITE, 0, 0, 0, ndw);
> +				ib->ptr[ib->length_dw++] = pe;
> +				ib->ptr[ib->length_dw++] = upper_32_bits(pe) & 0xff;
>   				for (; ndw > 0; ndw -= 2, --count, pe += 8) {
>   					if (flags & RADEON_VM_PAGE_SYSTEM) {
>   						value = radeon_vm_map_gart(rdev, addr);
> @@ -3109,8 +3111,8 @@ void si_vm_set_page(struct radeon_device *rdev, uint64_t pe,
>   					}
>   					addr += incr;
>   					value |= r600_flags;
> -					radeon_ring_write(ring, value);
> -					radeon_ring_write(ring, upper_32_bits(value));
> +					ib->ptr[ib->length_dw++] = value;
> +					ib->ptr[ib->length_dw++] = upper_32_bits(value);
>   				}
>   			}
>   		} else {
> @@ -3124,20 +3126,22 @@ void si_vm_set_page(struct radeon_device *rdev, uint64_t pe,
>   				else
>   					value = 0;
>   				/* for physically contiguous pages (vram) */
> -				radeon_ring_write(ring, DMA_PTE_PDE_PACKET(ndw));
> -				radeon_ring_write(ring, pe); /* dst addr */
> -				radeon_ring_write(ring, upper_32_bits(pe) & 0xff);
> -				radeon_ring_write(ring, r600_flags); /* mask */
> -				radeon_ring_write(ring, 0);
> -				radeon_ring_write(ring, value); /* value */
> -				radeon_ring_write(ring, upper_32_bits(value));
> -				radeon_ring_write(ring, incr); /* increment size */
> -				radeon_ring_write(ring, 0);
> +				ib->ptr[ib->length_dw++] = DMA_PTE_PDE_PACKET(ndw);
> +				ib->ptr[ib->length_dw++] = pe; /* dst addr */
> +				ib->ptr[ib->length_dw++] = upper_32_bits(pe) & 0xff;
> +				ib->ptr[ib->length_dw++] = r600_flags; /* mask */
> +				ib->ptr[ib->length_dw++] = 0;
> +				ib->ptr[ib->length_dw++] = value; /* value */
> +				ib->ptr[ib->length_dw++] = upper_32_bits(value);
> +				ib->ptr[ib->length_dw++] = incr; /* increment size */
> +				ib->ptr[ib->length_dw++] = 0;
>   				pe += ndw * 4;
>   				addr += (ndw / 2) * incr;
>   				count -= ndw / 2;
>   			}
>   		}
> +		while (ib->length_dw & 0x7)
> +			ib->ptr[ib->length_dw++] = DMA_PACKET(DMA_PACKET_NOP, 0, 0, 0, 0);
>   	}
>   }
>   



More information about the dri-devel mailing list