Same for OA<div><br></div><div>Cheers</div><div><br></div><div>Mike<br><br><div class="gmail_quote"><div dir="ltr">On Tue, 27 Nov 2018, 01:57 Marek Olšák, <<a href="mailto:maraeo@gmail.com">maraeo@gmail.com</a>> wrote:<br></div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">From: Marek Olšák <<a href="mailto:marek.olsak@amd.com" target="_blank">marek.olsak@amd.com</a>><br>
<br>
---<br>
src/gallium/drivers/radeon/radeon_winsys.h | 4 +-<br>
src/gallium/winsys/amdgpu/drm/amdgpu_bo.c | 55 +++++++++++++---------<br>
2 files changed, 36 insertions(+), 23 deletions(-)<br>
<br>
diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h<br>
index 3d0bb75ef6e..a5dd3e6f9b1 100644<br>
--- a/src/gallium/drivers/radeon/radeon_winsys.h<br>
+++ b/src/gallium/drivers/radeon/radeon_winsys.h<br>
@@ -45,21 +45,23 @@ enum radeon_bo_layout {<br>
RADEON_LAYOUT_LINEAR = 0,<br>
RADEON_LAYOUT_TILED,<br>
RADEON_LAYOUT_SQUARETILED,<br>
<br>
RADEON_LAYOUT_UNKNOWN<br>
};<br>
<br>
enum radeon_bo_domain { /* bitfield */<br>
RADEON_DOMAIN_GTT = 2,<br>
RADEON_DOMAIN_VRAM = 4,<br>
- RADEON_DOMAIN_VRAM_GTT = RADEON_DOMAIN_VRAM | RADEON_DOMAIN_GTT<br>
+ RADEON_DOMAIN_VRAM_GTT = RADEON_DOMAIN_VRAM | RADEON_DOMAIN_GTT,<br>
+ RADEON_DOMAIN_GDS = 8,<br>
+ RADEON_DOMAIN_OA = 16,<br>
};<br>
<br>
enum radeon_bo_flag { /* bitfield */<br>
RADEON_FLAG_GTT_WC = (1 << 0),<br>
RADEON_FLAG_NO_CPU_ACCESS = (1 << 1),<br>
RADEON_FLAG_NO_SUBALLOC = (1 << 2),<br>
RADEON_FLAG_SPARSE = (1 << 3),<br>
RADEON_FLAG_NO_INTERPROCESS_SHARING = (1 << 4),<br>
RADEON_FLAG_READ_ONLY = (1 << 5),<br>
RADEON_FLAG_32BIT = (1 << 6),<br>
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c<br>
index a9170a2bc69..1470c873a6a 100644<br>
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c<br>
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c<br>
@@ -177,22 +177,24 @@ void amdgpu_bo_destroy(struct pb_buffer *_buf)<br>
simple_mtx_lock(&ws->global_bo_list_lock);<br>
LIST_DEL(&bo->u.real.global_list_item);<br>
ws->num_buffers--;<br>
simple_mtx_unlock(&ws->global_bo_list_lock);<br>
}<br>
<br>
simple_mtx_lock(&ws->bo_export_table_lock);<br>
util_hash_table_remove(ws->bo_export_table, bo->bo);<br>
simple_mtx_unlock(&ws->bo_export_table_lock);<br>
<br>
- amdgpu_bo_va_op(bo->bo, 0, bo->base.size, bo->va, 0, AMDGPU_VA_OP_UNMAP);<br>
- amdgpu_va_range_free(bo->u.real.va_handle);<br>
+ if (bo->initial_domain & RADEON_DOMAIN_VRAM_GTT) {<br>
+ amdgpu_bo_va_op(bo->bo, 0, bo->base.size, bo->va, 0, AMDGPU_VA_OP_UNMAP);<br>
+ amdgpu_va_range_free(bo->u.real.va_handle);<br>
+ }<br>
amdgpu_bo_free(bo->bo);<br>
<br>
amdgpu_bo_remove_fences(bo);<br>
<br>
if (bo->initial_domain & RADEON_DOMAIN_VRAM)<br>
ws->allocated_vram -= align64(bo->base.size, ws->info.gart_page_size);<br>
else if (bo->initial_domain & RADEON_DOMAIN_GTT)<br>
ws->allocated_gtt -= align64(bo->base.size, ws->info.gart_page_size);<br>
<br>
if (bo->u.real.map_count >= 1) {<br>
@@ -418,25 +420,26 @@ static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *ws,<br>
unsigned alignment,<br>
enum radeon_bo_domain initial_domain,<br>
unsigned flags,<br>
int heap)<br>
{<br>
struct amdgpu_bo_alloc_request request = {0};<br>
amdgpu_bo_handle buf_handle;<br>
uint64_t va = 0;<br>
struct amdgpu_winsys_bo *bo;<br>
amdgpu_va_handle va_handle;<br>
- unsigned va_gap_size;<br>
int r;<br>
<br>
/* VRAM or GTT must be specified, but not both at the same time. */<br>
- assert(util_bitcount(initial_domain & RADEON_DOMAIN_VRAM_GTT) == 1);<br>
+ assert(util_bitcount(initial_domain & (RADEON_DOMAIN_VRAM_GTT |<br>
+ RADEON_DOMAIN_GDS |<br>
+ RADEON_DOMAIN_OA)) == 1);<br>
<br>
/* Gfx9: Overallocate the size to the next power of two for faster address<br>
* translation if we don't waste too much memory.<br>
*/<br>
if (ws->info.chip_class >= GFX9) {<br>
uint64_t next_pot_size = util_next_power_of_two64(size);<br>
<br>
/* For slightly lower than 4 GB allocations, at most 32 MB are wasted.<br>
* For slightly lower than 256 MB allocations, at most 2 MB are wasted.<br>
* For slightly lower than 64 MB allocations, at most 512 KB are wasted.<br>
@@ -464,20 +467,24 @@ static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *ws,<br>
pb_cache_init_entry(&ws->bo_cache, &bo->u.real.cache_entry, &bo->base,<br>
heap);<br>
}<br>
request.alloc_size = size;<br>
request.phys_alignment = alignment;<br>
<br>
if (initial_domain & RADEON_DOMAIN_VRAM)<br>
request.preferred_heap |= AMDGPU_GEM_DOMAIN_VRAM;<br>
if (initial_domain & RADEON_DOMAIN_GTT)<br>
request.preferred_heap |= AMDGPU_GEM_DOMAIN_GTT;<br>
+ if (initial_domain & RADEON_DOMAIN_GDS)<br>
+ request.preferred_heap |= AMDGPU_GEM_DOMAIN_GDS;<br>
+ if (initial_domain & RADEON_DOMAIN_OA)<br>
+ request.preferred_heap |= AMDGPU_GEM_DOMAIN_OA;<br>
<br>
/* Since VRAM and GTT have almost the same performance on APUs, we could<br>
* just set GTT. However, in order to decrease GTT(RAM) usage, which is<br>
* shared with the OS, allow VRAM placements too. The idea is not to use<br>
* VRAM usefully, but to use it so that it's not unused and wasted.<br>
*/<br>
if (!ws->info.has_dedicated_vram)<br>
request.preferred_heap |= AMDGPU_GEM_DOMAIN_GTT;<br>
<br>
if (flags & RADEON_FLAG_NO_CPU_ACCESS)<br>
@@ -493,41 +500,43 @@ static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *ws,<br>
<br>
r = amdgpu_bo_alloc(ws->dev, &request, &buf_handle);<br>
if (r) {<br>
fprintf(stderr, "amdgpu: Failed to allocate a buffer:\n");<br>
fprintf(stderr, "amdgpu: size : %"PRIu64" bytes\n", size);<br>
fprintf(stderr, "amdgpu: alignment : %u bytes\n", alignment);<br>
fprintf(stderr, "amdgpu: domains : %u\n", initial_domain);<br>
goto error_bo_alloc;<br>
}<br>
<br>
- va_gap_size = ws->check_vm ? MAX2(4 * alignment, 64 * 1024) : 0;<br>
+ if (initial_domain & RADEON_DOMAIN_VRAM_GTT) {<br>
+ unsigned va_gap_size = ws->check_vm ? MAX2(4 * alignment, 64 * 1024) : 0;<br>
<br>
- r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,<br>
- size + va_gap_size,<br>
- amdgpu_get_optimal_vm_alignment(ws, size, alignment),<br>
- 0, &va, &va_handle,<br>
- (flags & RADEON_FLAG_32BIT ? AMDGPU_VA_RANGE_32_BIT : 0) |<br>
- AMDGPU_VA_RANGE_HIGH);<br>
- if (r)<br>
- goto error_va_alloc;<br>
+ r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,<br>
+ size + va_gap_size,<br>
+ amdgpu_get_optimal_vm_alignment(ws, size, alignment),<br>
+ 0, &va, &va_handle,<br>
+ (flags & RADEON_FLAG_32BIT ? AMDGPU_VA_RANGE_32_BIT : 0) |<br>
+ AMDGPU_VA_RANGE_HIGH);<br>
+ if (r)<br>
+ goto error_va_alloc;<br>
<br>
- unsigned vm_flags = AMDGPU_VM_PAGE_READABLE |<br>
- AMDGPU_VM_PAGE_EXECUTABLE;<br>
+ unsigned vm_flags = AMDGPU_VM_PAGE_READABLE |<br>
+ AMDGPU_VM_PAGE_EXECUTABLE;<br>
<br>
- if (!(flags & RADEON_FLAG_READ_ONLY))<br>
- vm_flags |= AMDGPU_VM_PAGE_WRITEABLE;<br>
+ if (!(flags & RADEON_FLAG_READ_ONLY))<br>
+ vm_flags |= AMDGPU_VM_PAGE_WRITEABLE;<br>
<br>
- r = amdgpu_bo_va_op_raw(ws->dev, buf_handle, 0, size, va, vm_flags,<br>
+ r = amdgpu_bo_va_op_raw(ws->dev, buf_handle, 0, size, va, vm_flags,<br>
AMDGPU_VA_OP_MAP);<br>
- if (r)<br>
- goto error_va_map;<br>
+ if (r)<br>
+ goto error_va_map;<br>
+ }<br>
<br>
pipe_reference_init(&bo->base.reference, 1);<br>
bo->base.alignment = alignment;<br>
bo->base.usage = 0;<br>
bo->base.size = size;<br>
bo->base.vtbl = &amdgpu_winsys_bo_vtbl;<br>
bo->ws = ws;<br>
bo->bo = buf_handle;<br>
bo->va = va;<br>
bo->u.real.va_handle = va_handle;<br>
@@ -1328,22 +1337,24 @@ no_slab:<br>
return amdgpu_bo_sparse_create(ws, size, domain, flags);<br>
}<br>
<br>
/* This flag is irrelevant for the cache. */<br>
flags &= ~RADEON_FLAG_NO_SUBALLOC;<br>
<br>
/* Align size to page size. This is the minimum alignment for normal<br>
* BOs. Aligning this here helps the cached bufmgr. Especially small BOs,<br>
* like constant/uniform buffers, can benefit from better and more reuse.<br>
*/<br>
- size = align64(size, ws->info.gart_page_size);<br>
- alignment = align(alignment, ws->info.gart_page_size);<br>
+ if (domain & RADEON_DOMAIN_VRAM_GTT) {<br>
+ size = align64(size, ws->info.gart_page_size);<br>
+ alignment = align(alignment, ws->info.gart_page_size);<br>
+ }<br>
<br>
bool use_reusable_pool = flags & RADEON_FLAG_NO_INTERPROCESS_SHARING;<br>
<br>
if (use_reusable_pool) {<br>
heap = radeon_get_heap_index(domain, flags);<br>
assert(heap >= 0 && heap < RADEON_MAX_CACHED_HEAPS);<br>
<br>
/* Get a buffer from the cache. */<br>
bo = (struct amdgpu_winsys_bo*)<br>
pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment, 0, heap);<br>
-- <br>
2.17.1<br>
<br>
_______________________________________________<br>
mesa-dev mailing list<br>
<a href="mailto:mesa-dev@lists.freedesktop.org" target="_blank">mesa-dev@lists.freedesktop.org</a><br>
<a href="https://lists.freedesktop.org/mailman/listinfo/mesa-dev" rel="noreferrer" target="_blank">https://lists.freedesktop.org/mailman/listinfo/mesa-dev</a><br>
</blockquote></div></div>