[Mesa-dev] [RFC PATCH] radeonsi: set a per-buffer flag that disables inter-process sharing (v2)
Marek Olšák
maraeo at gmail.com
Tue Jul 18 20:08:44 UTC 2017
From: Marek Olšák <marek.olsak at amd.com>
For lower overhead in the CS ioctl.
Winsys allocators are not used with interprocess-sharable resources.
v2: It shouldn't crash anymore, but the kernel will reject the new flag.
---
src/gallium/drivers/radeon/r600_buffer_common.c | 7 +++++
src/gallium/drivers/radeon/radeon_winsys.h | 20 +++++++++++---
src/gallium/winsys/amdgpu/drm/amdgpu_bo.c | 36 ++++++++++++++++---------
src/gallium/winsys/radeon/drm/radeon_drm_bo.c | 27 +++++++++++--------
4 files changed, 62 insertions(+), 28 deletions(-)
diff --git a/src/gallium/drivers/radeon/r600_buffer_common.c b/src/gallium/drivers/radeon/r600_buffer_common.c
index dd1c209..2747ac4 100644
--- a/src/gallium/drivers/radeon/r600_buffer_common.c
+++ b/src/gallium/drivers/radeon/r600_buffer_common.c
@@ -160,20 +160,27 @@ void r600_init_resource_fields(struct r600_common_screen *rscreen,
}
/* Tiled textures are unmappable. Always put them in VRAM. */
if ((res->b.b.target != PIPE_BUFFER && !rtex->surface.is_linear) ||
res->flags & R600_RESOURCE_FLAG_UNMAPPABLE) {
res->domains = RADEON_DOMAIN_VRAM;
res->flags |= RADEON_FLAG_NO_CPU_ACCESS |
RADEON_FLAG_GTT_WC;
}
+ /* Only displayable single-sample textures can be shared between
+ * processes. */
+ if (res->b.b.target == PIPE_BUFFER ||
+ res->b.b.nr_samples >= 2 ||
+ rtex->surface.micro_tile_mode != RADEON_MICRO_MODE_DISPLAY)
+ res->flags |= RADEON_FLAG_NO_INTERPROCESS_SHARING;
+
/* If VRAM is just stolen system memory, allow both VRAM and
* GTT, whichever has free space. If a buffer is evicted from
* VRAM to GTT, it will stay there.
*
* DRM 3.6.0 has good BO move throttling, so we can allow VRAM-only
* placements even with a low amount of stolen VRAM.
*/
if (!rscreen->info.has_dedicated_vram &&
(rscreen->info.drm_major < 3 || rscreen->info.drm_minor < 6) &&
res->domains == RADEON_DOMAIN_VRAM) {
diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h
index 351edcd..0abcb56 100644
--- a/src/gallium/drivers/radeon/radeon_winsys.h
+++ b/src/gallium/drivers/radeon/radeon_winsys.h
@@ -47,20 +47,21 @@ enum radeon_bo_domain { /* bitfield */
RADEON_DOMAIN_GTT = 2,
RADEON_DOMAIN_VRAM = 4,
RADEON_DOMAIN_VRAM_GTT = RADEON_DOMAIN_VRAM | RADEON_DOMAIN_GTT
};
enum radeon_bo_flag { /* bitfield */
RADEON_FLAG_GTT_WC = (1 << 0),
RADEON_FLAG_NO_CPU_ACCESS = (1 << 1),
RADEON_FLAG_NO_SUBALLOC = (1 << 2),
RADEON_FLAG_SPARSE = (1 << 3),
+ RADEON_FLAG_NO_INTERPROCESS_SHARING = (1 << 4),
};
enum radeon_bo_usage { /* bitfield */
RADEON_USAGE_READ = 2,
RADEON_USAGE_WRITE = 4,
RADEON_USAGE_READWRITE = RADEON_USAGE_READ | RADEON_USAGE_WRITE,
/* The winsys ensures that the CS submission will be scheduled after
* previously flushed CSs referencing this BO in a conflicting way.
*/
@@ -685,28 +686,33 @@ static inline enum radeon_bo_domain radeon_domain_from_heap(enum radeon_heap hea
default:
assert(0);
return (enum radeon_bo_domain)0;
}
}
static inline unsigned radeon_flags_from_heap(enum radeon_heap heap)
{
switch (heap) {
case RADEON_HEAP_VRAM_NO_CPU_ACCESS:
- return RADEON_FLAG_GTT_WC | RADEON_FLAG_NO_CPU_ACCESS;
+ return RADEON_FLAG_GTT_WC |
+ RADEON_FLAG_NO_CPU_ACCESS |
+ RADEON_FLAG_NO_INTERPROCESS_SHARING;
+
case RADEON_HEAP_VRAM:
case RADEON_HEAP_VRAM_GTT:
case RADEON_HEAP_GTT_WC:
- return RADEON_FLAG_GTT_WC;
+ return RADEON_FLAG_GTT_WC |
+ RADEON_FLAG_NO_INTERPROCESS_SHARING;
+
case RADEON_HEAP_GTT:
default:
- return 0;
+ return RADEON_FLAG_NO_INTERPROCESS_SHARING;
}
}
/* The pb cache bucket is chosen to minimize pb_cache misses.
* It must be between 0 and 3 inclusive.
*/
static inline unsigned radeon_get_pb_cache_bucket_index(enum radeon_heap heap)
{
switch (heap) {
case RADEON_HEAP_VRAM_NO_CPU_ACCESS:
@@ -724,22 +730,28 @@ static inline unsigned radeon_get_pb_cache_bucket_index(enum radeon_heap heap)
/* Return the heap index for winsys allocators, or -1 on failure. */
static inline int radeon_get_heap_index(enum radeon_bo_domain domain,
enum radeon_bo_flag flags)
{
/* VRAM implies WC (write combining) */
assert(!(domain & RADEON_DOMAIN_VRAM) || flags & RADEON_FLAG_GTT_WC);
/* NO_CPU_ACCESS implies VRAM only. */
assert(!(flags & RADEON_FLAG_NO_CPU_ACCESS) || domain == RADEON_DOMAIN_VRAM);
+ /* Resources with interprocess sharing don't use any winsys allocators. */
+ if (!(flags & RADEON_FLAG_NO_INTERPROCESS_SHARING))
+ return -1;
+
/* Unsupported flags: NO_SUBALLOC, SPARSE. */
- if (flags & ~(RADEON_FLAG_GTT_WC | RADEON_FLAG_NO_CPU_ACCESS))
+ if (flags & ~(RADEON_FLAG_GTT_WC |
+ RADEON_FLAG_NO_CPU_ACCESS |
+ RADEON_FLAG_NO_INTERPROCESS_SHARING))
return -1;
switch (domain) {
case RADEON_DOMAIN_VRAM:
if (flags & RADEON_FLAG_NO_CPU_ACCESS)
return RADEON_HEAP_VRAM_NO_CPU_ACCESS;
else
return RADEON_HEAP_VRAM;
case RADEON_DOMAIN_VRAM_GTT:
return RADEON_HEAP_VRAM_GTT;
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
index 97bbe23..06b8198 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
@@ -31,20 +31,24 @@
#include "amdgpu_cs.h"
#include "os/os_time.h"
#include "state_tracker/drm_driver.h"
#include <amdgpu_drm.h>
#include <xf86drm.h>
#include <stdio.h>
#include <inttypes.h>
+#ifndef AMDGPU_GEM_CREATE_NO_INTERPROCESS_SHARING
+#define AMDGPU_GEM_CREATE_NO_INTERPROCESS_SHARING (1 << 6)
+#endif
+
/* Set to 1 for verbose output showing committed sparse buffer ranges. */
#define DEBUG_SPARSE_COMMITS 0
struct amdgpu_sparse_backing_chunk {
uint32_t begin, end;
};
static struct pb_buffer *
amdgpu_bo_create(struct radeon_winsys *rws,
uint64_t size,
@@ -395,20 +399,22 @@ static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *ws,
if (initial_domain & RADEON_DOMAIN_VRAM)
request.preferred_heap |= AMDGPU_GEM_DOMAIN_VRAM;
if (initial_domain & RADEON_DOMAIN_GTT)
request.preferred_heap |= AMDGPU_GEM_DOMAIN_GTT;
if (flags & RADEON_FLAG_NO_CPU_ACCESS)
request.flags |= AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
if (flags & RADEON_FLAG_GTT_WC)
request.flags |= AMDGPU_GEM_CREATE_CPU_GTT_USWC;
+ if (flags & RADEON_FLAG_NO_INTERPROCESS_SHARING)
+ request.flags |= AMDGPU_GEM_CREATE_NO_INTERPROCESS_SHARING;
r = amdgpu_bo_alloc(ws->dev, &request, &buf_handle);
if (r) {
fprintf(stderr, "amdgpu: Failed to allocate a buffer:\n");
fprintf(stderr, "amdgpu: size : %"PRIu64" bytes\n", size);
fprintf(stderr, "amdgpu: alignment : %u bytes\n", alignment);
fprintf(stderr, "amdgpu: domains : %u\n", initial_domain);
goto error_bo_alloc;
}
@@ -1127,21 +1133,21 @@ static void amdgpu_buffer_set_metadata(struct pb_buffer *_buf,
static struct pb_buffer *
amdgpu_bo_create(struct radeon_winsys *rws,
uint64_t size,
unsigned alignment,
enum radeon_bo_domain domain,
enum radeon_bo_flag flags)
{
struct amdgpu_winsys *ws = amdgpu_winsys(rws);
struct amdgpu_winsys_bo *bo;
- unsigned usage = 0, pb_cache_bucket;
+ unsigned usage = 0, pb_cache_bucket = 0;
/* VRAM implies WC. This is not optional. */
assert(!(domain & RADEON_DOMAIN_VRAM) || flags & RADEON_FLAG_GTT_WC);
/* NO_CPU_ACCESS is valid with VRAM only. */
assert(domain == RADEON_DOMAIN_VRAM || !(flags & RADEON_FLAG_NO_CPU_ACCESS));
/* Sub-allocate small buffers from slabs. */
if (!(flags & (RADEON_FLAG_NO_SUBALLOC | RADEON_FLAG_SPARSE)) &&
size <= (1 << AMDGPU_SLAB_MAX_SIZE_LOG2) &&
@@ -1182,48 +1188,52 @@ no_slab:
/* This flag is irrelevant for the cache. */
flags &= ~RADEON_FLAG_NO_SUBALLOC;
/* Align size to page size. This is the minimum alignment for normal
* BOs. Aligning this here helps the cached bufmgr. Especially small BOs,
* like constant/uniform buffers, can benefit from better and more reuse.
*/
size = align64(size, ws->info.gart_page_size);
alignment = align(alignment, ws->info.gart_page_size);
- int heap = radeon_get_heap_index(domain, flags);
- assert(heap >= 0 && heap < RADEON_MAX_CACHED_HEAPS);
- usage = 1 << heap; /* Only set one usage bit for each heap. */
+ bool use_reusable_pool = flags & RADEON_FLAG_NO_INTERPROCESS_SHARING;
- pb_cache_bucket = radeon_get_pb_cache_bucket_index(heap);
- assert(pb_cache_bucket < ARRAY_SIZE(ws->bo_cache.buckets));
+ if (use_reusable_pool) {
+ int heap = radeon_get_heap_index(domain, flags);
+ assert(heap >= 0 && heap < RADEON_MAX_CACHED_HEAPS);
+ usage = 1 << heap; /* Only set one usage bit for each heap. */
- /* Get a buffer from the cache. */
- bo = (struct amdgpu_winsys_bo*)
- pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment, usage,
- pb_cache_bucket);
- if (bo)
- return &bo->base;
+ pb_cache_bucket = radeon_get_pb_cache_bucket_index(heap);
+ assert(pb_cache_bucket < ARRAY_SIZE(ws->bo_cache.buckets));
+
+ /* Get a buffer from the cache. */
+ bo = (struct amdgpu_winsys_bo*)
+ pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment, usage,
+ pb_cache_bucket);
+ if (bo)
+ return &bo->base;
+ }
/* Create a new one. */
bo = amdgpu_create_bo(ws, size, alignment, usage, domain, flags,
pb_cache_bucket);
if (!bo) {
/* Clear the cache and try again. */
pb_slabs_reclaim(&ws->bo_slabs);
pb_cache_release_all_buffers(&ws->bo_cache);
bo = amdgpu_create_bo(ws, size, alignment, usage, domain, flags,
pb_cache_bucket);
if (!bo)
return NULL;
}
- bo->u.real.use_reusable_pool = true;
+ bo->u.real.use_reusable_pool = use_reusable_pool;
return &bo->base;
}
static struct pb_buffer *amdgpu_bo_from_handle(struct radeon_winsys *rws,
struct winsys_handle *whandle,
unsigned *stride,
unsigned *offset)
{
struct amdgpu_winsys *ws = amdgpu_winsys(rws);
struct amdgpu_winsys_bo *bo;
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
index 8027a5f..15e9d38 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
@@ -907,21 +907,21 @@ static void radeon_bo_set_metadata(struct pb_buffer *_buf,
static struct pb_buffer *
radeon_winsys_bo_create(struct radeon_winsys *rws,
uint64_t size,
unsigned alignment,
enum radeon_bo_domain domain,
enum radeon_bo_flag flags)
{
struct radeon_drm_winsys *ws = radeon_drm_winsys(rws);
struct radeon_bo *bo;
- unsigned usage = 0, pb_cache_bucket;
+ unsigned usage = 0, pb_cache_bucket = 0;
assert(!(flags & RADEON_FLAG_SPARSE)); /* not supported */
/* Only 32-bit sizes are supported. */
if (size > UINT_MAX)
return NULL;
/* VRAM implies WC. This is not optional. */
if (domain & RADEON_DOMAIN_VRAM)
flags |= RADEON_FLAG_GTT_WC;
@@ -962,46 +962,51 @@ no_slab:
/* This flag is irrelevant for the cache. */
flags &= ~RADEON_FLAG_NO_SUBALLOC;
/* Align size to page size. This is the minimum alignment for normal
* BOs. Aligning this here helps the cached bufmgr. Especially small BOs,
* like constant/uniform buffers, can benefit from better and more reuse.
*/
size = align(size, ws->info.gart_page_size);
alignment = align(alignment, ws->info.gart_page_size);
- int heap = radeon_get_heap_index(domain, flags);
- assert(heap >= 0 && heap < RADEON_MAX_CACHED_HEAPS);
- usage = 1 << heap; /* Only set one usage bit for each heap. */
+ bool use_reusable_pool = flags & RADEON_FLAG_NO_INTERPROCESS_SHARING;
- pb_cache_bucket = radeon_get_pb_cache_bucket_index(heap);
- assert(pb_cache_bucket < ARRAY_SIZE(ws->bo_cache.buckets));
+ /* Shared resources don't use cached heaps. */
+ if (use_reusable_pool) {
+ int heap = radeon_get_heap_index(domain, flags);
+ assert(heap >= 0 && heap < RADEON_MAX_CACHED_HEAPS);
+ usage = 1 << heap; /* Only set one usage bit for each heap. */
- bo = radeon_bo(pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment,
- usage, pb_cache_bucket));
- if (bo)
- return &bo->base;
+ pb_cache_bucket = radeon_get_pb_cache_bucket_index(heap);
+ assert(pb_cache_bucket < ARRAY_SIZE(ws->bo_cache.buckets));
+
+ bo = radeon_bo(pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment,
+ usage, pb_cache_bucket));
+ if (bo)
+ return &bo->base;
+ }
bo = radeon_create_bo(ws, size, alignment, usage, domain, flags,
pb_cache_bucket);
if (!bo) {
/* Clear the cache and try again. */
if (ws->info.has_virtual_memory)
pb_slabs_reclaim(&ws->bo_slabs);
pb_cache_release_all_buffers(&ws->bo_cache);
bo = radeon_create_bo(ws, size, alignment, usage, domain, flags,
pb_cache_bucket);
if (!bo)
return NULL;
}
- bo->u.real.use_reusable_pool = true;
+ bo->u.real.use_reusable_pool = use_reusable_pool;
mtx_lock(&ws->bo_handles_mutex);
util_hash_table_set(ws->bo_handles, (void*)(uintptr_t)bo->handle, bo);
mtx_unlock(&ws->bo_handles_mutex);
return &bo->base;
}
static struct pb_buffer *radeon_winsys_bo_from_ptr(struct radeon_winsys *rws,
void *pointer, uint64_t size)
--
2.7.4
More information about the mesa-dev
mailing list