[PATCH 1/2] drm/radeon: add large PTE support for NI, SI and CIK v4

Jay Cornwall jay at jcornwall.me
Thu May 1 10:29:08 PDT 2014


On 2014-05-01 11:52, Christian König wrote:

Some minor comment fixes inline. I've been using v3 of this patch on SI 
for quite a while, with no visible failures.

Thanks for pushing this.

> From: Christian König <christian.koenig at amd.com>
> 
> This patch implements support for VRAM page table entry compression.
> PTE construction is enhanced to identify physically contiguous page
> ranges and mark them in the PTE fragment field. L1 TLB and L2 cache
                                                   ^^^^^^^^^^^^^^^^^^^
This should read "L1/L2 TLB". The HW spec refers to the L2 TLB as the VM L2 
"cache", which led to the confusing wording in the draft comments.

> support is enabled for 64KB (SI/CIK) and 256KB (NI) PTE fragments,
> significantly improving TLB utilization for VRAM allocations.
> 
> Linear store bandwidth is improved from 60GB/s to 125GB/s on Pitcairn.
> Unigine Heaven 3.0 sees an average improvement from 24.7 to 27.7 FPS
> on default settings at 1920x1200 resolution with vsync disabled.
> 
> The main comment in radeon_vm.c gives a technical description.
> 
> v2 (chk): rebased and simplified.
> v3 (chk): add missing hw setup
> v4 (chk): rebased on current drm-fixes-3.15
> 
> Signed-off-by: Jay Cornwall <jay at jcornwall.me>
> Signed-off-by: Christian König <christian.koenig at amd.com>
> ---
>  drivers/gpu/drm/radeon/cik.c       |  4 +-
>  drivers/gpu/drm/radeon/ni.c        |  2 +
>  drivers/gpu/drm/radeon/radeon.h    |  5 +++
>  drivers/gpu/drm/radeon/radeon_vm.c | 91 +++++++++++++++++++++++++++++++++++---
>  drivers/gpu/drm/radeon/si.c        |  5 ++-
>  5 files changed, 98 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/gpu/drm/radeon/cik.c 
> b/drivers/gpu/drm/radeon/cik.c
> index 38f3fcc..38da9f3 100644
> --- a/drivers/gpu/drm/radeon/cik.c
> +++ b/drivers/gpu/drm/radeon/cik.c
> @@ -5395,6 +5395,7 @@ static int cik_pcie_gart_enable(struct
> radeon_device *rdev)
>  	WREG32(MC_VM_MX_L1_TLB_CNTL,
>  	       (0xA << 7) |
>  	       ENABLE_L1_TLB |
> +	       ENABLE_L1_FRAGMENT_PROCESSING |
>  	       SYSTEM_ACCESS_MODE_NOT_IN_SYS |
>  	       ENABLE_ADVANCED_DRIVER_MODEL |
>  	       SYSTEM_APERTURE_UNMAPPED_ACCESS_PASS_THRU);
> @@ -5407,7 +5408,8 @@ static int cik_pcie_gart_enable(struct
> radeon_device *rdev)
>  	       CONTEXT1_IDENTITY_ACCESS_MODE(1));
>  	WREG32(VM_L2_CNTL2, INVALIDATE_ALL_L1_TLBS | INVALIDATE_L2_CACHE);
>  	WREG32(VM_L2_CNTL3, L2_CACHE_BIGK_ASSOCIATIVITY |
> -	       L2_CACHE_BIGK_FRAGMENT_SIZE(6));
> +	       BANK_SELECT(4) |
> +	       L2_CACHE_BIGK_FRAGMENT_SIZE(4));
>  	/* setup context0 */
>  	WREG32(VM_CONTEXT0_PAGE_TABLE_START_ADDR, rdev->mc.gtt_start >> 12);
>  	WREG32(VM_CONTEXT0_PAGE_TABLE_END_ADDR, rdev->mc.gtt_end >> 12);
> diff --git a/drivers/gpu/drm/radeon/ni.c b/drivers/gpu/drm/radeon/ni.c
> index d246e04..5e8db9b 100644
> --- a/drivers/gpu/drm/radeon/ni.c
> +++ b/drivers/gpu/drm/radeon/ni.c
> @@ -1228,12 +1228,14 @@ static int cayman_pcie_gart_enable(struct
> radeon_device *rdev)
>  	       SYSTEM_APERTURE_UNMAPPED_ACCESS_PASS_THRU);
>  	/* Setup L2 cache */
>  	WREG32(VM_L2_CNTL, ENABLE_L2_CACHE |
> +	       ENABLE_L2_FRAGMENT_PROCESSING |
>  	       ENABLE_L2_PTE_CACHE_LRU_UPDATE_BY_WRITE |
>  	       ENABLE_L2_PDE0_CACHE_LRU_UPDATE_BY_WRITE |
>  	       EFFECTIVE_L2_QUEUE_SIZE(7) |
>  	       CONTEXT1_IDENTITY_ACCESS_MODE(1));
>  	WREG32(VM_L2_CNTL2, INVALIDATE_ALL_L1_TLBS | INVALIDATE_L2_CACHE);
>  	WREG32(VM_L2_CNTL3, L2_CACHE_BIGK_ASSOCIATIVITY |
> +	       BANK_SELECT(6) |
>  	       L2_CACHE_BIGK_FRAGMENT_SIZE(6));
>  	/* setup context0 */
>  	WREG32(VM_CONTEXT0_PAGE_TABLE_START_ADDR, rdev->mc.gtt_start >> 12);
> diff --git a/drivers/gpu/drm/radeon/radeon.h 
> b/drivers/gpu/drm/radeon/radeon.h
> index 6852861..e3d6be3 100644
> --- a/drivers/gpu/drm/radeon/radeon.h
> +++ b/drivers/gpu/drm/radeon/radeon.h
> @@ -854,6 +854,11 @@ struct radeon_mec {
>  #define R600_PTE_READABLE	(1 << 5)
>  #define R600_PTE_WRITEABLE	(1 << 6)
> 
> +/* PTE (Page Table Entry) fragment field for different page sizes */
> +#define R600_PTE_FRAG_4KB	(0 << 7)
> +#define R600_PTE_FRAG_64KB	(4 << 7)
> +#define R600_PTE_FRAG_256KB	(6 << 7)
> +
>  struct radeon_vm_pt {
>  	struct radeon_bo		*bo;
>  	uint64_t			addr;
> diff --git a/drivers/gpu/drm/radeon/radeon_vm.c
> b/drivers/gpu/drm/radeon/radeon_vm.c
> index 2aae6ce..6bf656e 100644
> --- a/drivers/gpu/drm/radeon/radeon_vm.c
> +++ b/drivers/gpu/drm/radeon/radeon_vm.c
> @@ -658,6 +658,84 @@ int radeon_vm_update_page_directory(struct
> radeon_device *rdev,
>  }
> 
>  /**
> + * radeon_vm_frag_ptes - add fragment information to PTEs
> + *
> + * @rdev: radeon_device pointer
> + * @ib: IB for the update
> + * @pe_start: first PTE to handle
> + * @pe_end: last PTE to handle
> + * @addr: addr those PTEs should point to
> + * @flags: hw mapping flags
> + *
> + * Global and local mutex must be locked!
> + */
> +static void radeon_vm_frag_ptes(struct radeon_device *rdev,
> +				struct radeon_ib *ib,
> +				uint64_t pe_start, uint64_t pe_end,
> +				uint64_t addr, uint32_t flags)
> +{
> +	/**
> +	 * The MC L1 TLB supports variable sized pages, based on a fragment
> +	 * field in the PTE. When this field is set to a non-zero value, page
> +	 * granularity is increased from 4KB to (1 << (12 + frag)). The PTE
> +	 * flags are considered valid for all PTEs within the fragment range
> +	 * and corresponding mappings are assumed to be physically contiguous.
> +	 *
> +	 * The L1 TLB can store a single PTE for the whole fragment,
> +	 * significantly increasing the space available for translation
> +	 * caching. This leads to large improvements in throughput when the
> +	 * TLB is under pressure.
> +	 *
> +	 * The L2 cache distributes small and large fragments into two
                   ^^^^^
Again, L2 TLB.

> +	 * asymmetric partitions. The large fragment cache is significantly
> +	 * larger. Thus, we try to use large fragments wherever possible.
> +	 * Userspace can support this by aligning virtual base address and
> +	 * allocation size to the fragment size.
> +	 */
> +
> +	/* NI is optimized for 256KB fragments, SI and newer for 64KB */
> +	uint64_t frag_flags = rdev->family == CHIP_CAYMAN ?
> +			R600_PTE_FRAG_256KB : R600_PTE_FRAG_64KB;
> +	uint64_t frag_align = rdev->family == CHIP_CAYMAN ? 0x200 : 0x80;
> +
> +	uint64_t frag_start = ALIGN(pe_start, frag_align);
> +	uint64_t frag_end = pe_end & ~(frag_align - 1);
> +
> +	unsigned count;
> +
> +	/* system pages are not contiguous */
> +	if ((flags & R600_PTE_SYSTEM) || !(flags & R600_PTE_VALID) ||
> +	    (frag_start >= frag_end)) {
> +
> +		count = (pe_end - pe_start) / 8;
> +		radeon_asic_vm_set_page(rdev, ib, pe_start, addr, count,
> +					RADEON_GPU_PAGE_SIZE, flags);
> +		return;
> +	}
> +
> +	/* handle the 4K area at the beginning */
> +	if (pe_start != frag_start) {
> +		count = (frag_start - pe_start) / 8;
> +		radeon_asic_vm_set_page(rdev, ib, pe_start, addr, count,
> +					RADEON_GPU_PAGE_SIZE, flags);
> +		addr += RADEON_GPU_PAGE_SIZE * count;
> +	}
> +
> +	/* handle the area in the middle */
> +	count = (frag_end - frag_start) / 8;
> +	radeon_asic_vm_set_page(rdev, ib, frag_start, addr, count,
> +				RADEON_GPU_PAGE_SIZE, flags | frag_flags);
> +
> +	/* handle the 4K area at the end */
> +	if (frag_end != pe_end) {
> +		addr += RADEON_GPU_PAGE_SIZE * count;
> +		count = (pe_end - frag_end) / 8;
> +		radeon_asic_vm_set_page(rdev, ib, frag_end, addr, count,
> +					RADEON_GPU_PAGE_SIZE, flags);
> +	}
> +}
> +
> +/**
>   * radeon_vm_update_ptes - make sure that page tables are valid
>   *
>   * @rdev: radeon_device pointer
> @@ -703,10 +781,9 @@ static void radeon_vm_update_ptes(struct
> radeon_device *rdev,
>  		if ((last_pte + 8 * count) != pte) {
> 
>  			if (count) {
> -				radeon_asic_vm_set_page(rdev, ib, last_pte,
> -							last_dst, count,
> -							RADEON_GPU_PAGE_SIZE,
> -							flags);
> +				radeon_vm_frag_ptes(rdev, ib, last_pte,
> +						    last_pte + 8 * count,
> +						    last_dst, flags);
>  			}
> 
>  			count = nptes;
> @@ -721,9 +798,9 @@ static void radeon_vm_update_ptes(struct
> radeon_device *rdev,
>  	}
> 
>  	if (count) {
> -		radeon_asic_vm_set_page(rdev, ib, last_pte,
> -					last_dst, count,
> -					RADEON_GPU_PAGE_SIZE, flags);
> +		radeon_vm_frag_ptes(rdev, ib, last_pte,
> +				    last_pte + 8 * count,
> +				    last_dst, flags);
>  	}
>  }
> 
> diff --git a/drivers/gpu/drm/radeon/si.c b/drivers/gpu/drm/radeon/si.c
> index 22a63c9..dece3be 100644
> --- a/drivers/gpu/drm/radeon/si.c
> +++ b/drivers/gpu/drm/radeon/si.c
> @@ -4044,18 +4044,21 @@ static int si_pcie_gart_enable(struct
> radeon_device *rdev)
>  	WREG32(MC_VM_MX_L1_TLB_CNTL,
>  	       (0xA << 7) |
>  	       ENABLE_L1_TLB |
> +	       ENABLE_L1_FRAGMENT_PROCESSING |
>  	       SYSTEM_ACCESS_MODE_NOT_IN_SYS |
>  	       ENABLE_ADVANCED_DRIVER_MODEL |
>  	       SYSTEM_APERTURE_UNMAPPED_ACCESS_PASS_THRU);
>  	/* Setup L2 cache */
>  	WREG32(VM_L2_CNTL, ENABLE_L2_CACHE |
> +	       ENABLE_L2_FRAGMENT_PROCESSING |
>  	       ENABLE_L2_PTE_CACHE_LRU_UPDATE_BY_WRITE |
>  	       ENABLE_L2_PDE0_CACHE_LRU_UPDATE_BY_WRITE |
>  	       EFFECTIVE_L2_QUEUE_SIZE(7) |
>  	       CONTEXT1_IDENTITY_ACCESS_MODE(1));
>  	WREG32(VM_L2_CNTL2, INVALIDATE_ALL_L1_TLBS | INVALIDATE_L2_CACHE);
>  	WREG32(VM_L2_CNTL3, L2_CACHE_BIGK_ASSOCIATIVITY |
> -	       L2_CACHE_BIGK_FRAGMENT_SIZE(0));
> +	       BANK_SELECT(4) |
> +	       L2_CACHE_BIGK_FRAGMENT_SIZE(4));
>  	/* setup context0 */
>  	WREG32(VM_CONTEXT0_PAGE_TABLE_START_ADDR, rdev->mc.gtt_start >> 12);
>  	WREG32(VM_CONTEXT0_PAGE_TABLE_END_ADDR, rdev->mc.gtt_end >> 12);


More information about the dri-devel mailing list