[Mesa-dev] [RFC PATCH] radeonsi: set a per-buffer flag that disables inter-process sharing (v2)
Samuel Pitoiset
samuel.pitoiset at gmail.com
Thu Jul 20 11:06:11 UTC 2017
On 07/20/2017 04:20 AM, zhoucm1 wrote:
>
>
> On 2017-07-19 23:34, Marek Olšák wrote:
>>
>>
>> On Jul 19, 2017 3:36 AM, "zhoucm1" <david1.zhou at amd.com> wrote:
>>
>>
>>
>> On 2017-07-19 04:08, Marek Olšák wrote:
>>
>> From: Marek Olšák <marek.olsak at amd.com>
>>
>> For lower overhead in the CS ioctl.
>> Winsys allocators are not used with interprocess-sharable
>> resources.
>>
>> Hi Marek,
>>
>> Could you explain how this approach reduces overhead in the CS
>> ioctl? By reusing BOs to shorten the BO list?
>>
>>
>> The kernel part of the work hasn't been done yet. The idea is that
>> nonsharable buffers don't have to be revalidated by TTM,
> OK, maybe I can only see the whole picture of this idea once you
> complete the kernel part.
> Out of curiosity, how can non-sharable buffers skip TTM revalidation
> without exposing something like an amdgpu_bo_make_resident API?
>
> As mentioned in another thread, if we can expose a make_resident API,
> we can remove the BO list entirely, and even remove the reservation
> operation from the CS ioctl.
> I now think our BO list is a very bad design.
> First, the UMD must create a BO list for every command submission,
> which is extra CPU overhead compared with the traditional approach
> (see the sketch below).
> Second, the kernel also has to iterate over the list. When the BO list
> is long, as with OpenCL programs that routinely throw several thousand
> BOs into it, reservation must safely take thousands of ww_mutex locks,
> and the CPU overhead is too big.
>
> So I strongly suggest that we expose a make_resident API to user
> space. If we cannot, I want to know the specific reason, to see
> whether we can solve it.
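>
> To make the first point concrete, here is roughly what every
> submission has to do today. This is only a sketch against the
> libdrm_amdgpu API (dev, ctx, num_bos, bo_handles and ib_info are
> assumed to be set up elsewhere; error handling omitted):
>
>     #include <amdgpu.h>
>
>     /* A throwaway BO list is created, submitted and destroyed for
>      * every single CS; the kernel re-validates each listed BO. */
>     amdgpu_bo_list_handle bo_list;
>     amdgpu_bo_list_create(dev, num_bos, bo_handles, NULL, &bo_list);
>
>     struct amdgpu_cs_request request = {0};
>     request.resources = bo_list;
>     request.number_of_ibs = 1;
>     request.ibs = &ib_info;
>     amdgpu_cs_submit(ctx, 0, &request, 1);
>
>     amdgpu_bo_list_destroy(bo_list);
>
> With a residency interface, none of this per-submission churn would be
> needed.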
Introducing a make_resident API would also help ARB_bindless_texture a
lot, because currently, when a texture is marked as resident, we have
to re-validate the related buffers for every new CS, just like
ordinary buffers. With a resident BO list, the whole mechanism could
be skipped.
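
To make the proposal concrete, here is a rough sketch of what such an
interface could look like. These entry points don't exist in libdrm
today; the names and signatures are only placeholders for the idea:

    #include <amdgpu.h> /* libdrm_amdgpu handle types */

    /* Hypothetical: pin a BO into the process's residency set, so it
     * is validated once here instead of on every CS ioctl. */
    int amdgpu_bo_make_resident(amdgpu_device_handle dev,
                                amdgpu_bo_handle bo);

    /* Hypothetical: drop a BO from the residency set, e.g. when the
     * app calls glMakeTextureHandleNonResidentARB(). */
    int amdgpu_bo_make_nonresident(amdgpu_device_handle dev,
                                   amdgpu_bo_handle bo);

A CS submission could then skip the BO list entirely for resident
buffers, because the kernel would already know their placement.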
>
>
> Regards,
> David Zhou
>> so it should remove a lot of kernel overhead and the BO list remains
>> the same.
>>
>> Marek
>>
>>
>>
>> Thanks,
>> David Zhou
>>
>>
>> v2: It shouldn't crash anymore, but the kernel will reject the new flag.
>> ---
>>  src/gallium/drivers/radeon/r600_buffer_common.c |  7 +++++
>>  src/gallium/drivers/radeon/radeon_winsys.h      | 20 +++++++++++---
>>  src/gallium/winsys/amdgpu/drm/amdgpu_bo.c       | 36 ++++++++++++++++---------
>>  src/gallium/winsys/radeon/drm/radeon_drm_bo.c   | 27 +++++++++++--------
>>  4 files changed, 62 insertions(+), 28 deletions(-)
>>
>> diff --git a/src/gallium/drivers/radeon/r600_buffer_common.c b/src/gallium/drivers/radeon/r600_buffer_common.c
>> index dd1c209..2747ac4 100644
>> --- a/src/gallium/drivers/radeon/r600_buffer_common.c
>> +++ b/src/gallium/drivers/radeon/r600_buffer_common.c
>> @@ -160,20 +160,27 @@ void r600_init_resource_fields(struct r600_common_screen *rscreen,
>>      }
>>
>>      /* Tiled textures are unmappable. Always put them in VRAM. */
>>      if ((res->b.b.target != PIPE_BUFFER && !rtex->surface.is_linear) ||
>>          res->flags & R600_RESOURCE_FLAG_UNMAPPABLE) {
>>          res->domains = RADEON_DOMAIN_VRAM;
>>          res->flags |= RADEON_FLAG_NO_CPU_ACCESS |
>>                        RADEON_FLAG_GTT_WC;
>>      }
>>
>> +    /* Only displayable single-sample textures can be shared between
>> +     * processes. */
>> +    if (res->b.b.target == PIPE_BUFFER ||
>> +        res->b.b.nr_samples >= 2 ||
>> +        rtex->surface.micro_tile_mode != RADEON_MICRO_MODE_DISPLAY)
>> +        res->flags |= RADEON_FLAG_NO_INTERPROCESS_SHARING;
>> +
>>      /* If VRAM is just stolen system memory, allow both VRAM and
>>       * GTT, whichever has free space. If a buffer is evicted from
>>       * VRAM to GTT, it will stay there.
>>       *
>>       * DRM 3.6.0 has good BO move throttling, so we can allow VRAM-only
>>       * placements even with a low amount of stolen VRAM.
>>       */
>>      if (!rscreen->info.has_dedicated_vram &&
>>          (rscreen->info.drm_major < 3 || rscreen->info.drm_minor < 6) &&
>>          res->domains == RADEON_DOMAIN_VRAM) {
>> diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h
>> index 351edcd..0abcb56 100644
>> --- a/src/gallium/drivers/radeon/radeon_winsys.h
>> +++ b/src/gallium/drivers/radeon/radeon_winsys.h
>> @@ -47,20 +47,21 @@ enum radeon_bo_domain { /* bitfield */
>>      RADEON_DOMAIN_GTT = 2,
>>      RADEON_DOMAIN_VRAM = 4,
>>      RADEON_DOMAIN_VRAM_GTT = RADEON_DOMAIN_VRAM | RADEON_DOMAIN_GTT
>>  };
>>
>>  enum radeon_bo_flag { /* bitfield */
>>      RADEON_FLAG_GTT_WC = (1 << 0),
>>      RADEON_FLAG_NO_CPU_ACCESS = (1 << 1),
>>      RADEON_FLAG_NO_SUBALLOC = (1 << 2),
>>      RADEON_FLAG_SPARSE = (1 << 3),
>> +    RADEON_FLAG_NO_INTERPROCESS_SHARING = (1 << 4),
>>  };
>>
>>  enum radeon_bo_usage { /* bitfield */
>>      RADEON_USAGE_READ = 2,
>>      RADEON_USAGE_WRITE = 4,
>>      RADEON_USAGE_READWRITE = RADEON_USAGE_READ | RADEON_USAGE_WRITE,
>>
>>      /* The winsys ensures that the CS submission will be scheduled after
>>       * previously flushed CSs referencing this BO in a conflicting way.
>>       */
>> @@ -685,28 +686,33 @@ static inline enum radeon_bo_domain radeon_domain_from_heap(enum radeon_heap heap)
>>      default:
>>          assert(0);
>>          return (enum radeon_bo_domain)0;
>>      }
>>  }
>>
>>  static inline unsigned radeon_flags_from_heap(enum radeon_heap heap)
>>  {
>>      switch (heap) {
>>      case RADEON_HEAP_VRAM_NO_CPU_ACCESS:
>> -        return RADEON_FLAG_GTT_WC | RADEON_FLAG_NO_CPU_ACCESS;
>> +        return RADEON_FLAG_GTT_WC |
>> +               RADEON_FLAG_NO_CPU_ACCESS |
>> +               RADEON_FLAG_NO_INTERPROCESS_SHARING;
>> +
>>      case RADEON_HEAP_VRAM:
>>      case RADEON_HEAP_VRAM_GTT:
>>      case RADEON_HEAP_GTT_WC:
>> -        return RADEON_FLAG_GTT_WC;
>> +        return RADEON_FLAG_GTT_WC |
>> +               RADEON_FLAG_NO_INTERPROCESS_SHARING;
>> +
>>      case RADEON_HEAP_GTT:
>>      default:
>> -        return 0;
>> +        return RADEON_FLAG_NO_INTERPROCESS_SHARING;
>>      }
>>  }
>>
>>  /* The pb cache bucket is chosen to minimize pb_cache misses.
>>   * It must be between 0 and 3 inclusive.
>>   */
>>  static inline unsigned radeon_get_pb_cache_bucket_index(enum radeon_heap heap)
>>  {
>>      switch (heap) {
>>      case RADEON_HEAP_VRAM_NO_CPU_ACCESS:
>> @@ -724,22 +730,28 @@ static inline unsigned radeon_get_pb_cache_bucket_index(enum radeon_heap heap)
>>
>>  /* Return the heap index for winsys allocators, or -1 on failure. */
>>  static inline int radeon_get_heap_index(enum radeon_bo_domain domain,
>>                                          enum radeon_bo_flag flags)
>>  {
>>      /* VRAM implies WC (write combining) */
>>      assert(!(domain & RADEON_DOMAIN_VRAM) || flags & RADEON_FLAG_GTT_WC);
>>      /* NO_CPU_ACCESS implies VRAM only. */
>>      assert(!(flags & RADEON_FLAG_NO_CPU_ACCESS) || domain == RADEON_DOMAIN_VRAM);
>>
>> +    /* Resources with interprocess sharing don't use any winsys allocators. */
>> +    if (!(flags & RADEON_FLAG_NO_INTERPROCESS_SHARING))
>> +        return -1;
>> +
>>      /* Unsupported flags: NO_SUBALLOC, SPARSE. */
>> -    if (flags & ~(RADEON_FLAG_GTT_WC | RADEON_FLAG_NO_CPU_ACCESS))
>> +    if (flags & ~(RADEON_FLAG_GTT_WC |
>> +                  RADEON_FLAG_NO_CPU_ACCESS |
>> +                  RADEON_FLAG_NO_INTERPROCESS_SHARING))
>>          return -1;
>>
>>      switch (domain) {
>>      case RADEON_DOMAIN_VRAM:
>>          if (flags & RADEON_FLAG_NO_CPU_ACCESS)
>>              return RADEON_HEAP_VRAM_NO_CPU_ACCESS;
>>          else
>>              return RADEON_HEAP_VRAM;
>>      case RADEON_DOMAIN_VRAM_GTT:
>>          return RADEON_HEAP_VRAM_GTT;
>> diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
>> index 97bbe23..06b8198 100644
>> --- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
>> +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
>> @@ -31,20 +31,24 @@
>>  #include "amdgpu_cs.h"
>>  #include "os/os_time.h"
>>  #include "state_tracker/drm_driver.h"
>>  #include <amdgpu_drm.h>
>>  #include <xf86drm.h>
>>  #include <stdio.h>
>>  #include <inttypes.h>
>>
>> +#ifndef AMDGPU_GEM_CREATE_NO_INTERPROCESS_SHARING
>> +#define AMDGPU_GEM_CREATE_NO_INTERPROCESS_SHARING (1 << 6)
>> +#endif
>> +
>>  /* Set to 1 for verbose output showing committed sparse buffer ranges. */
>>  #define DEBUG_SPARSE_COMMITS 0
>>
>>  struct amdgpu_sparse_backing_chunk {
>>     uint32_t begin, end;
>>  };
>>
>>  static struct pb_buffer *
>>  amdgpu_bo_create(struct radeon_winsys *rws,
>>                   uint64_t size,
>> @@ -395,20 +399,22 @@ static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *ws,
>>     if (initial_domain & RADEON_DOMAIN_VRAM)
>>        request.preferred_heap |= AMDGPU_GEM_DOMAIN_VRAM;
>>     if (initial_domain & RADEON_DOMAIN_GTT)
>>        request.preferred_heap |= AMDGPU_GEM_DOMAIN_GTT;
>>
>>     if (flags & RADEON_FLAG_NO_CPU_ACCESS)
>>        request.flags |= AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
>>     if (flags & RADEON_FLAG_GTT_WC)
>>        request.flags |= AMDGPU_GEM_CREATE_CPU_GTT_USWC;
>> +   if (flags & RADEON_FLAG_NO_INTERPROCESS_SHARING)
>> +      request.flags |= AMDGPU_GEM_CREATE_NO_INTERPROCESS_SHARING;
>>
>>     r = amdgpu_bo_alloc(ws->dev, &request, &buf_handle);
>>     if (r) {
>>        fprintf(stderr, "amdgpu: Failed to allocate a buffer:\n");
>>        fprintf(stderr, "amdgpu:    size      : %"PRIu64" bytes\n", size);
>>        fprintf(stderr, "amdgpu:    alignment : %u bytes\n", alignment);
>>        fprintf(stderr, "amdgpu:    domains   : %u\n", initial_domain);
>>        goto error_bo_alloc;
>>     }
>>
>> @@ -1127,21 +1133,21 @@ static void amdgpu_buffer_set_metadata(struct pb_buffer *_buf,
>>  static struct pb_buffer *
>>  amdgpu_bo_create(struct radeon_winsys *rws,
>>                   uint64_t size,
>>                   unsigned alignment,
>>                   enum radeon_bo_domain domain,
>>                   enum radeon_bo_flag flags)
>>  {
>>     struct amdgpu_winsys *ws = amdgpu_winsys(rws);
>>     struct amdgpu_winsys_bo *bo;
>> -   unsigned usage = 0, pb_cache_bucket;
>> +   unsigned usage = 0, pb_cache_bucket = 0;
>>
>>     /* VRAM implies WC. This is not optional. */
>>     assert(!(domain & RADEON_DOMAIN_VRAM) || flags & RADEON_FLAG_GTT_WC);
>>
>>     /* NO_CPU_ACCESS is valid with VRAM only. */
>>     assert(domain == RADEON_DOMAIN_VRAM || !(flags & RADEON_FLAG_NO_CPU_ACCESS));
>>
>>     /* Sub-allocate small buffers from slabs. */
>>     if (!(flags & (RADEON_FLAG_NO_SUBALLOC | RADEON_FLAG_SPARSE)) &&
>>         size <= (1 << AMDGPU_SLAB_MAX_SIZE_LOG2) &&
>> @@ -1182,48 +1188,52 @@ no_slab:
>>     /* This flag is irrelevant for the cache. */
>>     flags &= ~RADEON_FLAG_NO_SUBALLOC;
>>
>>     /* Align size to page size. This is the minimum alignment for normal
>>      * BOs. Aligning this here helps the cached bufmgr. Especially small BOs,
>>      * like constant/uniform buffers, can benefit from better and more reuse.
>>      */
>>     size = align64(size, ws->info.gart_page_size);
>>     alignment = align(alignment, ws->info.gart_page_size);
>>
>> -   int heap = radeon_get_heap_index(domain, flags);
>> -   assert(heap >= 0 && heap < RADEON_MAX_CACHED_HEAPS);
>> -   usage = 1 << heap; /* Only set one usage bit for each heap. */
>> +   bool use_reusable_pool = flags & RADEON_FLAG_NO_INTERPROCESS_SHARING;
>>
>> -   pb_cache_bucket = radeon_get_pb_cache_bucket_index(heap);
>> -   assert(pb_cache_bucket < ARRAY_SIZE(ws->bo_cache.buckets));
>> +   if (use_reusable_pool) {
>> +      int heap = radeon_get_heap_index(domain, flags);
>> +      assert(heap >= 0 && heap < RADEON_MAX_CACHED_HEAPS);
>> +      usage = 1 << heap; /* Only set one usage bit for each heap. */
>>
>> -   /* Get a buffer from the cache. */
>> -   bo = (struct amdgpu_winsys_bo*)
>> -        pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment, usage,
>> -                                pb_cache_bucket);
>> -   if (bo)
>> -      return &bo->base;
>> +      pb_cache_bucket = radeon_get_pb_cache_bucket_index(heap);
>> +      assert(pb_cache_bucket < ARRAY_SIZE(ws->bo_cache.buckets));
>> +
>> +      /* Get a buffer from the cache. */
>> +      bo = (struct amdgpu_winsys_bo*)
>> +           pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment, usage,
>> +                                   pb_cache_bucket);
>> +      if (bo)
>> +         return &bo->base;
>> +   }
>>
>>     /* Create a new one. */
>>     bo = amdgpu_create_bo(ws, size, alignment, usage, domain, flags,
>>                           pb_cache_bucket);
>>     if (!bo) {
>>        /* Clear the cache and try again. */
>>        pb_slabs_reclaim(&ws->bo_slabs);
>>        pb_cache_release_all_buffers(&ws->bo_cache);
>>        bo = amdgpu_create_bo(ws, size, alignment, usage, domain, flags,
>>                              pb_cache_bucket);
>>        if (!bo)
>>           return NULL;
>>     }
>>
>> -   bo->u.real.use_reusable_pool = true;
>> +   bo->u.real.use_reusable_pool = use_reusable_pool;
>>     return &bo->base;
>>  }
>>
>>  static struct pb_buffer *amdgpu_bo_from_handle(struct radeon_winsys *rws,
>>                                                 struct winsys_handle *whandle,
>>                                                 unsigned *stride,
>>                                                 unsigned *offset)
>>  {
>>     struct amdgpu_winsys *ws = amdgpu_winsys(rws);
>>     struct amdgpu_winsys_bo *bo;
>> diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
>> index 8027a5f..15e9d38 100644
>> --- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
>> +++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
>> @@ -907,21 +907,21 @@ static void radeon_bo_set_metadata(struct pb_buffer *_buf,
>>  static struct pb_buffer *
>>  radeon_winsys_bo_create(struct radeon_winsys *rws,
>>                          uint64_t size,
>>                          unsigned alignment,
>>                          enum radeon_bo_domain domain,
>>                          enum radeon_bo_flag flags)
>>  {
>>      struct radeon_drm_winsys *ws = radeon_drm_winsys(rws);
>>      struct radeon_bo *bo;
>> -    unsigned usage = 0, pb_cache_bucket;
>> +    unsigned usage = 0, pb_cache_bucket = 0;
>>
>>      assert(!(flags & RADEON_FLAG_SPARSE)); /* not supported */
>>
>>      /* Only 32-bit sizes are supported. */
>>      if (size > UINT_MAX)
>>          return NULL;
>>
>>      /* VRAM implies WC. This is not optional. */
>>      if (domain & RADEON_DOMAIN_VRAM)
>>          flags |= RADEON_FLAG_GTT_WC;
>> @@ -962,46 +962,51 @@ no_slab:
>>      /* This flag is irrelevant for the cache. */
>>      flags &= ~RADEON_FLAG_NO_SUBALLOC;
>>
>>      /* Align size to page size. This is the minimum alignment for normal
>>       * BOs. Aligning this here helps the cached bufmgr. Especially small BOs,
>>       * like constant/uniform buffers, can benefit from better and more reuse.
>>       */
>>      size = align(size, ws->info.gart_page_size);
>>      alignment = align(alignment, ws->info.gart_page_size);
>>
>> -    int heap = radeon_get_heap_index(domain, flags);
>> -    assert(heap >= 0 && heap < RADEON_MAX_CACHED_HEAPS);
>> -    usage = 1 << heap; /* Only set one usage bit for each heap. */
>> +    bool use_reusable_pool = flags & RADEON_FLAG_NO_INTERPROCESS_SHARING;
>>
>> -    pb_cache_bucket = radeon_get_pb_cache_bucket_index(heap);
>> -    assert(pb_cache_bucket < ARRAY_SIZE(ws->bo_cache.buckets));
>> +    /* Shared resources don't use cached heaps. */
>> +    if (use_reusable_pool) {
>> +        int heap = radeon_get_heap_index(domain, flags);
>> +        assert(heap >= 0 && heap < RADEON_MAX_CACHED_HEAPS);
>> +        usage = 1 << heap; /* Only set one usage bit for each heap. */
>>
>> -    bo = radeon_bo(pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment,
>> -                                           usage, pb_cache_bucket));
>> -    if (bo)
>> -        return &bo->base;
>> +        pb_cache_bucket = radeon_get_pb_cache_bucket_index(heap);
>> +        assert(pb_cache_bucket < ARRAY_SIZE(ws->bo_cache.buckets));
>> +
>> +        bo = radeon_bo(pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment,
>> +                                               usage, pb_cache_bucket));
>> +        if (bo)
>> +            return &bo->base;
>> +    }
>>
>>      bo = radeon_create_bo(ws, size, alignment, usage, domain, flags,
>>                            pb_cache_bucket);
>>      if (!bo) {
>>          /* Clear the cache and try again. */
>>          if (ws->info.has_virtual_memory)
>>              pb_slabs_reclaim(&ws->bo_slabs);
>>          pb_cache_release_all_buffers(&ws->bo_cache);
>>          bo = radeon_create_bo(ws, size, alignment, usage, domain, flags,
>>                                pb_cache_bucket);
>>          if (!bo)
>>              return NULL;
>>      }
>>
>> -    bo->u.real.use_reusable_pool = true;
>> +    bo->u.real.use_reusable_pool = use_reusable_pool;
>>
>>      mtx_lock(&ws->bo_handles_mutex);
>>      util_hash_table_set(ws->bo_handles, (void*)(uintptr_t)bo->handle, bo);
>>      mtx_unlock(&ws->bo_handles_mutex);
>>
>>      return &bo->base;
>>  }
>>
>>  static struct pb_buffer *radeon_winsys_bo_from_ptr(struct radeon_winsys *rws,
>>                                                     void *pointer, uint64_t size)