[PATCH 1/2] drm/radeon: add large PTE support for NI, SI and CIK v4
Christian König
deathsimple at vodafone.de
Fri May 2 05:13:16 PDT 2014
Am 01.05.2014 19:29, schrieb Jay Cornwall:
> On 2014-05-01 11:52, Christian König wrote:
>
> Some minor comment fixes inline. I've been using v3 of this patch on
> SI for quite a while, with no visible failures.
Thanks for the notes. I've added them to my v5 of the patch and also
updated the file name in the commit message (we have moved that stuff to
radeon_vm.c in the meantime).
Christian.
>
> Thanks for pushing this.
>
>> From: Christian König <christian.koenig at amd.com>
>>
>> This patch implements support for VRAM page table entry compression.
>> PTE construction is enhanced to identify physically contiguous page
>> ranges and mark them in the PTE fragment field. L1 TLB and L2 cache
> ^^^^^^^^^^^^^^^^^^^
> This should read L1/L2 TLB. HW spec refers to the L2 TLB as the VM L2
> "cache", which confused the draft comments.
>
>> support is enabled for 64KB (SI/CIK) and 256KB (NI) PTE fragments,
>> significantly improving TLB utilization for VRAM allocations.
>>
>> Linear store bandwidth is improved from 60GB/s to 125GB/s on Pitcairn.
>> Unigine Heaven 3.0 sees an average improvement from 24.7 to 27.7 FPS
>> on default settings at 1920x1200 resolution with vsync disabled.
>>
>> The main comment in radeon_gart.c gives a technical description.
>>
>> v2 (chk): rebased and simplified.
>> v3 (chk): add missing hw setup
>> v4 (chk): rebased on current drm-fixes-3.15
>>
>> Signed-off-by: Jay Cornwall <jay at jcornwall.me>
>> Signed-off-by: Christian König <christian.koenig at amd.com>
>> ---
>> drivers/gpu/drm/radeon/cik.c | 4 +-
>> drivers/gpu/drm/radeon/ni.c | 2 +
>> drivers/gpu/drm/radeon/radeon.h | 5 +++
>> drivers/gpu/drm/radeon/radeon_vm.c | 91
>> +++++++++++++++++++++++++++++++++++---
>> drivers/gpu/drm/radeon/si.c | 5 ++-
>> 5 files changed, 98 insertions(+), 9 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/radeon/cik.c b/drivers/gpu/drm/radeon/cik.c
>> index 38f3fcc..38da9f3 100644
>> --- a/drivers/gpu/drm/radeon/cik.c
>> +++ b/drivers/gpu/drm/radeon/cik.c
>> @@ -5395,6 +5395,7 @@ static int cik_pcie_gart_enable(struct
>> radeon_device *rdev)
>> WREG32(MC_VM_MX_L1_TLB_CNTL,
>> (0xA << 7) |
>> ENABLE_L1_TLB |
>> + ENABLE_L1_FRAGMENT_PROCESSING |
>> SYSTEM_ACCESS_MODE_NOT_IN_SYS |
>> ENABLE_ADVANCED_DRIVER_MODEL |
>> SYSTEM_APERTURE_UNMAPPED_ACCESS_PASS_THRU);
>> @@ -5407,7 +5408,8 @@ static int cik_pcie_gart_enable(struct
>> radeon_device *rdev)
>> CONTEXT1_IDENTITY_ACCESS_MODE(1));
>> WREG32(VM_L2_CNTL2, INVALIDATE_ALL_L1_TLBS | INVALIDATE_L2_CACHE);
>> WREG32(VM_L2_CNTL3, L2_CACHE_BIGK_ASSOCIATIVITY |
>> - L2_CACHE_BIGK_FRAGMENT_SIZE(6));
>> + BANK_SELECT(4) |
>> + L2_CACHE_BIGK_FRAGMENT_SIZE(4));
>> /* setup context0 */
>> WREG32(VM_CONTEXT0_PAGE_TABLE_START_ADDR, rdev->mc.gtt_start >>
>> 12);
>> WREG32(VM_CONTEXT0_PAGE_TABLE_END_ADDR, rdev->mc.gtt_end >> 12);
>> diff --git a/drivers/gpu/drm/radeon/ni.c b/drivers/gpu/drm/radeon/ni.c
>> index d246e04..5e8db9b 100644
>> --- a/drivers/gpu/drm/radeon/ni.c
>> +++ b/drivers/gpu/drm/radeon/ni.c
>> @@ -1228,12 +1228,14 @@ static int cayman_pcie_gart_enable(struct
>> radeon_device *rdev)
>> SYSTEM_APERTURE_UNMAPPED_ACCESS_PASS_THRU);
>> /* Setup L2 cache */
>> WREG32(VM_L2_CNTL, ENABLE_L2_CACHE |
>> + ENABLE_L2_FRAGMENT_PROCESSING |
>> ENABLE_L2_PTE_CACHE_LRU_UPDATE_BY_WRITE |
>> ENABLE_L2_PDE0_CACHE_LRU_UPDATE_BY_WRITE |
>> EFFECTIVE_L2_QUEUE_SIZE(7) |
>> CONTEXT1_IDENTITY_ACCESS_MODE(1));
>> WREG32(VM_L2_CNTL2, INVALIDATE_ALL_L1_TLBS | INVALIDATE_L2_CACHE);
>> WREG32(VM_L2_CNTL3, L2_CACHE_BIGK_ASSOCIATIVITY |
>> + BANK_SELECT(6) |
>> L2_CACHE_BIGK_FRAGMENT_SIZE(6));
>> /* setup context0 */
>> WREG32(VM_CONTEXT0_PAGE_TABLE_START_ADDR, rdev->mc.gtt_start >>
>> 12);
>> diff --git a/drivers/gpu/drm/radeon/radeon.h
>> b/drivers/gpu/drm/radeon/radeon.h
>> index 6852861..e3d6be3 100644
>> --- a/drivers/gpu/drm/radeon/radeon.h
>> +++ b/drivers/gpu/drm/radeon/radeon.h
>> @@ -854,6 +854,11 @@ struct radeon_mec {
>> #define R600_PTE_READABLE (1 << 5)
>> #define R600_PTE_WRITEABLE (1 << 6)
>>
>> +/* PTE (Page Table Entry) fragment field for different page sizes */
>> +#define R600_PTE_FRAG_4KB (0 << 7)
>> +#define R600_PTE_FRAG_64KB (4 << 7)
>> +#define R600_PTE_FRAG_256KB (6 << 7)
>> +
>> struct radeon_vm_pt {
>> struct radeon_bo *bo;
>> uint64_t addr;
>> diff --git a/drivers/gpu/drm/radeon/radeon_vm.c
>> b/drivers/gpu/drm/radeon/radeon_vm.c
>> index 2aae6ce..6bf656e 100644
>> --- a/drivers/gpu/drm/radeon/radeon_vm.c
>> +++ b/drivers/gpu/drm/radeon/radeon_vm.c
>> @@ -658,6 +658,84 @@ int radeon_vm_update_page_directory(struct
>> radeon_device *rdev,
>> }
>>
>> /**
>> + * radeon_vm_frag_ptes - add fragment information to PTEs
>> + *
>> + * @rdev: radeon_device pointer
>> + * @ib: IB for the update
>> + * @pe_start: first PTE to handle
>> + * @pe_end: last PTE to handle
>> + * @addr: addr those PTEs should point to
>> + * @flags: hw mapping flags
>> + *
>> + * Global and local mutex must be locked!
>> + */
>> +static void radeon_vm_frag_ptes(struct radeon_device *rdev,
>> + struct radeon_ib *ib,
>> + uint64_t pe_start, uint64_t pe_end,
>> + uint64_t addr, uint32_t flags)
>> +{
>> + /**
>> + * The MC L1 TLB supports variable sized pages, based on a fragment
>> + * field in the PTE. When this field is set to a non-zero value,
>> page
>> + * granularity is increased from 4KB to (1 << (12 + frag)). The PTE
>> + * flags are considered valid for all PTEs within the fragment
>> range
>> + * and corresponding mappings are assumed to be physically
>> contiguous.
>> + *
>> + * The L1 TLB can store a single PTE for the whole fragment,
>> + * significantly increasing the space available for translation
>> + * caching. This leads to large improvements in throughput when the
>> + * TLB is under pressure.
>> + *
>> + * The L2 cache distributes small and large fragments into two
> ^^^^^
> Again, L2 TLB.
>
>> + * asymmetric partitions. The large fragment cache is significantly
>> + * larger. Thus, we try to use large fragments wherever possible.
>> + * Userspace can support this by aligning virtual base address and
>> + * allocation size to the fragment size.
>> + */
>> +
>> + /* NI is optimized for 256KB fragments, SI and newer for 64KB */
>> + uint64_t frag_flags = rdev->family == CHIP_CAYMAN ?
>> + R600_PTE_FRAG_256KB : R600_PTE_FRAG_64KB;
>> + uint64_t frag_align = rdev->family == CHIP_CAYMAN ? 0x200 : 0x80;
>> +
>> + uint64_t frag_start = ALIGN(pe_start, frag_align);
>> + uint64_t frag_end = pe_end & ~(frag_align - 1);
>> +
>> + unsigned count;
>> +
>> + /* system pages are not physically contiguous */
>> + if ((flags & R600_PTE_SYSTEM) || !(flags & R600_PTE_VALID) ||
>> + (frag_start >= frag_end)) {
>> +
>> + count = (pe_end - pe_start) / 8;
>> + radeon_asic_vm_set_page(rdev, ib, pe_start, addr, count,
>> + RADEON_GPU_PAGE_SIZE, flags);
>> + return;
>> + }
>> +
>> + /* handle the 4K area at the beginning */
>> + if (pe_start != frag_start) {
>> + count = (frag_start - pe_start) / 8;
>> + radeon_asic_vm_set_page(rdev, ib, pe_start, addr, count,
>> + RADEON_GPU_PAGE_SIZE, flags);
>> + addr += RADEON_GPU_PAGE_SIZE * count;
>> + }
>> +
>> + /* handle the area in the middle */
>> + count = (frag_end - frag_start) / 8;
>> + radeon_asic_vm_set_page(rdev, ib, frag_start, addr, count,
>> + RADEON_GPU_PAGE_SIZE, flags | frag_flags);
>> +
>> + /* handle the 4K area at the end */
>> + if (frag_end != pe_end) {
>> + addr += RADEON_GPU_PAGE_SIZE * count;
>> + count = (pe_end - frag_end) / 8;
>> + radeon_asic_vm_set_page(rdev, ib, frag_end, addr, count,
>> + RADEON_GPU_PAGE_SIZE, flags);
>> + }
>> +}
>> +
>> +/**
>> * radeon_vm_update_ptes - make sure that page tables are valid
>> *
>> * @rdev: radeon_device pointer
>> @@ -703,10 +781,9 @@ static void radeon_vm_update_ptes(struct
>> radeon_device *rdev,
>> if ((last_pte + 8 * count) != pte) {
>>
>> if (count) {
>> - radeon_asic_vm_set_page(rdev, ib, last_pte,
>> - last_dst, count,
>> - RADEON_GPU_PAGE_SIZE,
>> - flags);
>> + radeon_vm_frag_ptes(rdev, ib, last_pte,
>> + last_pte + 8 * count,
>> + last_dst, flags);
>> }
>>
>> count = nptes;
>> @@ -721,9 +798,9 @@ static void radeon_vm_update_ptes(struct
>> radeon_device *rdev,
>> }
>>
>> if (count) {
>> - radeon_asic_vm_set_page(rdev, ib, last_pte,
>> - last_dst, count,
>> - RADEON_GPU_PAGE_SIZE, flags);
>> + radeon_vm_frag_ptes(rdev, ib, last_pte,
>> + last_pte + 8 * count,
>> + last_dst, flags);
>> }
>> }
>>
>> diff --git a/drivers/gpu/drm/radeon/si.c b/drivers/gpu/drm/radeon/si.c
>> index 22a63c9..dece3be 100644
>> --- a/drivers/gpu/drm/radeon/si.c
>> +++ b/drivers/gpu/drm/radeon/si.c
>> @@ -4044,18 +4044,21 @@ static int si_pcie_gart_enable(struct
>> radeon_device *rdev)
>> WREG32(MC_VM_MX_L1_TLB_CNTL,
>> (0xA << 7) |
>> ENABLE_L1_TLB |
>> + ENABLE_L1_FRAGMENT_PROCESSING |
>> SYSTEM_ACCESS_MODE_NOT_IN_SYS |
>> ENABLE_ADVANCED_DRIVER_MODEL |
>> SYSTEM_APERTURE_UNMAPPED_ACCESS_PASS_THRU);
>> /* Setup L2 cache */
>> WREG32(VM_L2_CNTL, ENABLE_L2_CACHE |
>> + ENABLE_L2_FRAGMENT_PROCESSING |
>> ENABLE_L2_PTE_CACHE_LRU_UPDATE_BY_WRITE |
>> ENABLE_L2_PDE0_CACHE_LRU_UPDATE_BY_WRITE |
>> EFFECTIVE_L2_QUEUE_SIZE(7) |
>> CONTEXT1_IDENTITY_ACCESS_MODE(1));
>> WREG32(VM_L2_CNTL2, INVALIDATE_ALL_L1_TLBS | INVALIDATE_L2_CACHE);
>> WREG32(VM_L2_CNTL3, L2_CACHE_BIGK_ASSOCIATIVITY |
>> - L2_CACHE_BIGK_FRAGMENT_SIZE(0));
>> + BANK_SELECT(4) |
>> + L2_CACHE_BIGK_FRAGMENT_SIZE(4));
>> /* setup context0 */
>> WREG32(VM_CONTEXT0_PAGE_TABLE_START_ADDR, rdev->mc.gtt_start >>
>> 12);
>> WREG32(VM_CONTEXT0_PAGE_TABLE_END_ADDR, rdev->mc.gtt_end >> 12);
> _______________________________________________
> dri-devel mailing list
> dri-devel at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
More information about the dri-devel
mailing list