[Mesa-dev] [PATCH 03/13] gallium/util: add threaded_context as a pipe_context wrapper
Nicolai Hähnle
nhaehnle at gmail.com
Thu May 11 18:35:01 UTC 2017
On 11.05.2017 00:45, Marek Olšák wrote:
> From: Marek Olšák <marek.olsak at amd.com>
>
> ---
> src/gallium/auxiliary/Makefile.sources | 3 +
> src/gallium/auxiliary/util/u_threaded_context.c | 2300 ++++++++++++++++++++
> src/gallium/auxiliary/util/u_threaded_context.h | 349 +++
> .../auxiliary/util/u_threaded_context_calls.h | 66 +
> 4 files changed, 2718 insertions(+)
> create mode 100644 src/gallium/auxiliary/util/u_threaded_context.c
> create mode 100644 src/gallium/auxiliary/util/u_threaded_context.h
> create mode 100644 src/gallium/auxiliary/util/u_threaded_context_calls.h
>
> diff --git a/src/gallium/auxiliary/Makefile.sources b/src/gallium/auxiliary/Makefile.sources
> index dbdb3ca..baebee6 100644
> --- a/src/gallium/auxiliary/Makefile.sources
> +++ b/src/gallium/auxiliary/Makefile.sources
> @@ -294,20 +294,23 @@ C_SOURCES := \
> util/u_surfaces.h \
> util/u_tests.c \
> util/u_tests.h \
> util/u_texture.c \
> util/u_texture.h \
> util/u_tile.c \
> util/u_tile.h \
> util/u_time.h \
> util/u_transfer.c \
> util/u_transfer.h \
> + util/u_threaded_context.c \
> + util/u_threaded_context.h \
> + util/u_threaded_context_calls.h \
> util/u_upload_mgr.c \
> util/u_upload_mgr.h \
> util/u_vbuf.c \
> util/u_vbuf.h \
> util/u_video.h \
> util/u_viewport.h
>
> NIR_SOURCES := \
> nir/tgsi_to_nir.c \
> nir/tgsi_to_nir.h
> diff --git a/src/gallium/auxiliary/util/u_threaded_context.c b/src/gallium/auxiliary/util/u_threaded_context.c
> new file mode 100644
> index 0000000..9349b07
> --- /dev/null
> +++ b/src/gallium/auxiliary/util/u_threaded_context.c
> @@ -0,0 +1,2300 @@
> +/**************************************************************************
> + *
> + * Copyright 2017 Advanced Micro Devices, Inc.
> + * All Rights Reserved.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * on the rights to use, copy, modify, merge, publish, distribute, sub
> + * license, and/or sell copies of the Software, and to permit persons to whom
> + * the Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
> + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
> + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
> + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
> + * USE OR OTHER DEALINGS IN THE SOFTWARE.
> + *
> + **************************************************************************/
> +
> +#include "util/u_threaded_context.h"
> +#include "util/u_cpu_detect.h"
> +#include "util/u_format.h"
> +#include "util/u_inlines.h"
> +#include "util/u_memory.h"
> +#include "util/u_upload_mgr.h"
> +
> +/* 0 = disabled, 1 = assertions, 2 = printfs */
> +#define TC_DEBUG 0
> +
> +#if TC_DEBUG >= 1
> +#define tc_assert assert
> +#else
> +#define tc_assert(x)
> +#endif
> +
> +#if TC_DEBUG >= 2
> +#define tc_printf printf
> +#define tc_asprintf asprintf
> +#define tc_strcmp strcmp
> +#else
> +#define tc_printf(...)
> +#define tc_asprintf(...) 0
> +#define tc_strcmp(...) 0
> +#endif
> +
> +#define TC_SENTINEL 0x5ca1ab1e
> +
> +enum tc_call_id {
> +#define CALL(name) TC_CALL_##name,
> +#include "u_threaded_context_calls.h"
> +#undef CALL
> + TC_NUM_CALLS,
> +};
> +
> +typedef void (*tc_execute)(struct pipe_context *pipe, union tc_payload *payload);
> +
> +static const tc_execute execute_func[TC_NUM_CALLS];
> +
> +static void
> +tc_batch_check(struct tc_batch *batch)
> +{
> + tc_assert(batch->sentinel == TC_SENTINEL);
> + tc_assert(batch->sentinel2 == TC_SENTINEL);
> + tc_assert(batch->num_calls <= TC_CALLS_PER_BATCH);
> +}
> +
> +static void
> +tc_debug_check(struct threaded_context *tc)
> +{
> + for (unsigned i = 0; i < TC_MAX_BATCHES; i++) {
> + tc_batch_check(&tc->batch_slots[i]);
> + tc_assert(tc->batch_slots[i].pipe == tc->pipe);
> + }
> +}
> +
> +static void
> +tc_batch_execute(void *job, int thread_index)
> +{
> + struct tc_batch *batch = job;
> + struct pipe_context *pipe = batch->pipe;
> + struct tc_call *last = &batch->call[batch->num_calls];
> +
> + tc_batch_check(batch);
> +
> + for (struct tc_call *iter = batch->call; iter != last;
> + iter += iter->num_call_slots) {
> + tc_assert(iter->sentinel == TC_SENTINEL);
> + execute_func[iter->call_id](pipe, &iter->payload);
> + }
> +
> + tc_batch_check(batch);
> + batch->num_calls = 0;
> +}
> +
> +static void
> +tc_batch_flush(struct threaded_context *tc)
> +{
> + struct tc_batch *next = &tc->batch_slots[tc->next];
> +
> + tc_assert(next->num_calls != 0);
> + tc_batch_check(next);
> + tc_debug_check(tc);
> + p_atomic_add(&tc->num_offloaded_calls, next->num_calls);
This counts the number of call slots, not the number of calls. Please
rename tc->num_offloaded_calls accordingly, or change it to actually
track the number of calls. Same for num_calls, actually. Maybe it could
be renamed to num_slots?
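E.g. something like this, purely to illustrate the naming (untested):

   p_atomic_add(&tc->num_offloaded_slots, next->num_slots);

with the tc_batch field and the HUD counter renamed to match.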
> +
> + util_queue_add_job(&tc->queue, next, &next->fence, tc_batch_execute,
> + NULL);
> + tc->last = tc->next;
> + tc->next = (tc->next + 1) % TC_MAX_BATCHES;
> +}
> +
> +/* This is the function that adds variable-sized calls into the current
> + * batch. It also flushes the batch if there is not enough space there.
> + * All other higher-level "add" functions use it.
> + */
> +static union tc_payload *
> +tc_add_sized_call(struct threaded_context *tc, enum tc_call_id id,
> + unsigned payload_size)
> +{
> + struct tc_batch *next = &tc->batch_slots[tc->next];
> + unsigned total_size = offsetof(struct tc_call, payload) + payload_size;
> + unsigned num_call_slots = DIV_ROUND_UP(total_size, sizeof(struct tc_call));
> +
> + tc_debug_check(tc);
> +
> + if (unlikely(next->num_calls + num_call_slots - 1 >= TC_CALLS_PER_BATCH)) {
I think the simpler equivalent "next->num_calls + num_call_slots > TC_CALLS_PER_BATCH"
would be clearer here; it expresses "does this call still fit" directly.
> + tc_batch_flush(tc);
> + next = &tc->batch_slots[tc->next];
> + tc_assert(next->num_calls == 0);
> + }
> +
> + tc_assert(util_queue_fence_is_signalled(&next->fence));
> +
> + struct tc_call *call = &next->call[next->num_calls];
> + next->num_calls += num_call_slots;
> +
> + call->sentinel = TC_SENTINEL;
> + call->call_id = id;
> + call->num_call_slots = num_call_slots;
> +
> + tc_debug_check(tc);
> + return &call->payload;
> +}
> +
> +#define tc_add_struct_typed_call(tc, execute, type) \
> + ((struct type*)tc_add_sized_call(tc, execute, sizeof(struct type)))
> +
> +#define tc_add_slot_based_call(tc, execute, type, num_slots) \
> + ((struct type*)tc_add_sized_call(tc, execute, \
> + sizeof(struct type) + \
> + ((int)sizeof(((struct type*)NULL)->slot[0]) * \
> + ((int)num_slots - 1))))
The -1 here makes me nervous. I'd prefer to drop it and instead change the
slot structs to use 0-sized arrays; that's more idiomatic.
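Roughly what I have in mind, using tc_buffer_subdata as the example (untested sketch):

   struct tc_buffer_subdata {
      struct pipe_resource *resource;
      unsigned usage, offset, size;
      char slot[0]; /* more will be allocated if needed */
   };

   #define tc_add_slot_based_call(tc, execute, type, num_slots) \
      ((struct type*)tc_add_sized_call(tc, execute, \
                                       sizeof(struct type) + \
                                       sizeof(((struct type*)NULL)->slot[0]) * \
                                       (num_slots)))

Both GCC and Clang accept zero-length arrays, and the int casts can go away too.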
[snip]
> +/********************************************************************
> + * transfer
> + */
> +
> +struct tc_replace_buffer_storage {
> + struct pipe_resource *dst;
> + struct pipe_resource *src;
> + tc_replace_buffer_storage_func func;
> +};
> +
> +static void
> +tc_call_replace_buffer_storage(struct pipe_context *pipe,
> + union tc_payload *payload)
> +{
> + struct tc_replace_buffer_storage *p =
> + (struct tc_replace_buffer_storage *)payload;
> +
> + p->func(pipe, p->dst, p->src);
> + pipe_resource_reference(&p->dst, NULL);
> + pipe_resource_reference(&p->src, NULL);
> +}
> +
> +static bool
> +tc_invalidate_buffer(struct threaded_context *tc,
> + struct threaded_resource *tbuf)
> +{
> + /* We can't check if the buffer is idle, so we invalidate it
> + * unconditionally. */
> + struct pipe_screen *screen = tc->base.screen;
> + struct pipe_resource *new_buf;
> +
> + /* Shared, pinned, and sparse buffers can't be reallocated. */
> + if (tbuf->is_shared ||
> + tbuf->is_user_ptr ||
> + tbuf->b.flags & PIPE_RESOURCE_FLAG_SPARSE)
> + return false;
> +
> + /* Allocate a new one. */
> + new_buf = screen->resource_create(screen, &tbuf->b);
> + if (!new_buf)
> + return false;
> +
> + /* Replace the "latest" pointer. */
> + if (tbuf->latest != &tbuf->b)
> + pipe_resource_reference(&tbuf->latest, NULL);
> +
> + tbuf->latest = new_buf;
> + util_range_set_empty(&tbuf->valid_buffer_range);
> +
> + /* The valid range should point to the original buffer. */
> + threaded_resource(new_buf)->base_valid_buffer_range =
> + &tbuf->valid_buffer_range;
> +
> + /* Enqueue storage replacement of the original buffer. */
> + struct tc_replace_buffer_storage *p =
> + tc_add_struct_typed_call(tc, TC_CALL_replace_buffer_storage,
> + tc_replace_buffer_storage);
> +
> + p->func = tc->replace_buffer_storage;
> + tc_set_resource_reference(&p->dst, &tbuf->b);
> + tc_set_resource_reference(&p->src, new_buf);
> + return true;
> +}
> +
> +static unsigned
> +tc_improve_map_buffer_flags(struct threaded_context *tc,
> + struct threaded_resource *tres, unsigned usage,
> + unsigned offset, unsigned size)
> +{
> + /* Handle CPU reads trivially. */
> + if (usage & PIPE_TRANSFER_READ) {
> + /* Drivers aren't allowed to do buffer invalidations. */
> + return (usage & ~PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE) |
> + TC_TRANSFER_MAP_NO_INVALIDATE |
> + TC_TRANSFER_MAP_IGNORE_VALID_RANGE;
> + }
> +
> + /* Sparse buffers can't be mapped directly. Use a staging buffer. */
> + if (tres->b.flags & PIPE_RESOURCE_FLAG_SPARSE) {
> + return (usage & ~(PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE |
> + PIPE_TRANSFER_UNSYNCHRONIZED)) |
> + PIPE_TRANSFER_DISCARD_RANGE |
Why are we allowed to discard here? Even when a range is mapped only for
writing, we can't just assume that the whole range will be written by
the application.
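For example (pseudo-GL, with placeholder offsets/sizes), an application can
legitimately do:

   uint8_t *ptr = glMapBufferRange(GL_ARRAY_BUFFER, offset, size,
                                   GL_MAP_WRITE_BIT);
   memcpy(ptr + small_offset, new_data, small_size); /* only part of the range */
   glUnmapBuffer(GL_ARRAY_BUFFER);

and still expect the untouched bytes of the range to keep their previous
contents, since it didn't ask for GL_MAP_INVALIDATE_RANGE_BIT.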
Also, why do we clear the unsynchronized flag? As I understand the code,
we do need to synchronize the threads because we're going to use a
staging buffer. But theoretically, if the driver had a way to do the
staging copy without waiting for previously submitted draws, it could do
so. So... I don't think it has a visible effect right now, but I'd
rather not remove the unsynchronized flag here.
This may need a bit of clarification in the big comment in the header file.
> + TC_TRANSFER_MAP_NO_INVALIDATE |
> + TC_TRANSFER_MAP_IGNORE_VALID_RANGE;
> + }
> +
> + /* See if the buffer range being mapped has never been initialized,
> + * in which case it can be mapped unsynchronized. */
> + if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED) &&
> + !tres->is_shared &&
> + !util_ranges_intersect(&tres->valid_buffer_range, offset, offset + size))
> + usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
> +
> + if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) {
> + /* If discarding the entire range, discard the whole resource instead. */
> + if (usage & PIPE_TRANSFER_DISCARD_RANGE &&
> + offset == 0 && size == tres->b.width0)
> + usage |= PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE;
> +
> + /* Discard the whole resource if needed. */
> + if (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE) {
> + if (tc_invalidate_buffer(tc, tres))
> + usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
> + else
> + usage |= PIPE_TRANSFER_DISCARD_RANGE; /* fallback */
> + }
> + }
> +
> + /* We won't need this flag anymore. */
> + /* TODO: We might not need TC_TRANSFER_MAP_NO_INVALIDATE with this. */
> + usage &= ~PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE;
> +
> + /* GL_AMD_pinned_memory and persistent mappings can't use staging
> + * buffers. */
> + if (usage & (PIPE_TRANSFER_UNSYNCHRONIZED |
> + PIPE_TRANSFER_PERSISTENT) ||
> + tres->is_user_ptr)
> + usage &= ~PIPE_TRANSFER_DISCARD_RANGE;
> +
> + /* Unsynchronized buffer mappings don't have to synchronize the thread. */
> + if (usage & PIPE_TRANSFER_UNSYNCHRONIZED)
> + usage |= TC_TRANSFER_MAP_THREADED_UNSYNC; /* notify the driver */
> +
> + /* Never invalidate inside the driver and never infer "unsynchronized". */
> + return usage |
> + TC_TRANSFER_MAP_NO_INVALIDATE |
> + TC_TRANSFER_MAP_IGNORE_VALID_RANGE;
> +}
> +
> +static void *
> +tc_transfer_map(struct pipe_context *_pipe,
> + struct pipe_resource *resource, unsigned level,
> + unsigned usage, const struct pipe_box *box,
> + struct pipe_transfer **transfer)
> +{
> + struct threaded_context *tc = threaded_context(_pipe);
> + struct threaded_resource *tres = threaded_resource(resource);
> + struct pipe_context *pipe = tc->pipe;
> +
> + if (resource->target == PIPE_BUFFER) {
> + usage = tc_improve_map_buffer_flags(tc, tres, usage, box->x, box->width);
> +
> + /* Do a staging transfer within the threaded context. The driver should
> + * only get resource_copy_region.
> + */
> + if (usage & PIPE_TRANSFER_DISCARD_RANGE) {
> + struct threaded_transfer *ttrans = slab_alloc(&tc->pool_transfers);
> + uint8_t *map;
> +
> + ttrans->staging = NULL;
> +
> + u_upload_alloc(tc->base.stream_uploader, 0,
> + box->width + (box->x % tc->map_buffer_alignment),
> + 64, &ttrans->offset, &ttrans->staging, (void**)&map);
> + if (!map) {
> + slab_free(&tc->pool_transfers, ttrans);
> + return NULL;
> + }
> +
> + ttrans->b.resource = NULL;
> + pipe_resource_reference(&ttrans->b.resource, resource);
Use tc_set_resource_reference for these two lines.
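i.e., since the transfer comes fresh out of the slab and nothing needs to be
unreferenced first, simply:

   tc_set_resource_reference(&ttrans->b.resource, resource);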
> + ttrans->b.level = 0;
> + ttrans->b.usage = usage;
> + ttrans->b.box = *box;
> + ttrans->b.stride = 0;
> + ttrans->b.layer_stride = 0;
> + *transfer = &ttrans->b;
> + return map + (box->x % tc->map_buffer_alignment);
> + }
> + }
> +
> + /* Unsynchronized buffer mappings don't have to synchronize the thread. */
> + if (!(usage & TC_TRANSFER_MAP_THREADED_UNSYNC))
> + tc_sync_msg(tc, resource->target != PIPE_BUFFER ? " texture" :
> + usage & PIPE_TRANSFER_DISCARD_RANGE ? " discard_range" :
> + usage & PIPE_TRANSFER_READ ? " read" : " ??");
> +
> + return pipe->transfer_map(pipe, tres->latest ? tres->latest : resource,
> + level, usage, box, transfer);
The ternary operator here should be unnecessary -- tres->latest should
always be non-NULL.
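i.e. simply:

   return pipe->transfer_map(pipe, tres->latest, level, usage, box, transfer);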
> +}
> +
> +struct tc_transfer_flush_region {
> + struct pipe_transfer *transfer;
> + struct pipe_box box;
> +};
> +
> +static void
> +tc_call_transfer_flush_region(struct pipe_context *pipe,
> + union tc_payload *payload)
> +{
> + struct tc_transfer_flush_region *p =
> + (struct tc_transfer_flush_region *)payload;
> +
> + pipe->transfer_flush_region(pipe, p->transfer, &p->box);
> +}
> +
> +struct tc_resource_copy_region {
> + struct pipe_resource *dst;
> + unsigned dst_level;
> + unsigned dstx, dsty, dstz;
> + struct pipe_resource *src;
> + unsigned src_level;
> + struct pipe_box src_box;
> +};
> +
> +static void
> +tc_resource_copy_region(struct pipe_context *_pipe,
> + struct pipe_resource *dst, unsigned dst_level,
> + unsigned dstx, unsigned dsty, unsigned dstz,
> + struct pipe_resource *src, unsigned src_level,
> + const struct pipe_box *src_box);
> +
> +static void
> +tc_buffer_do_flush_region(struct threaded_context *tc,
> + struct threaded_transfer *ttrans,
> + const struct pipe_box *box)
> +{
> + struct threaded_resource *tres = threaded_resource(ttrans->b.resource);
> +
> + if (ttrans->staging) {
> + struct pipe_box src_box;
> +
> + u_box_1d(ttrans->offset + box->x % tc->map_buffer_alignment,
> + box->width, &src_box);
> +
> + /* Copy the staging buffer into the original one. */
> + tc_resource_copy_region(&tc->base, ttrans->b.resource, 0, box->x, 0, 0,
> + ttrans->staging, 0, &src_box);
> + }
> +
> + util_range_add(tres->base_valid_buffer_range, box->x, box->x + box->width);
> +}
> +
> +static void
> +tc_transfer_flush_region(struct pipe_context *_pipe,
> + struct pipe_transfer *transfer,
> + const struct pipe_box *rel_box)
> +{
> + struct threaded_context *tc = threaded_context(_pipe);
> + struct threaded_transfer *ttrans = threaded_transfer(transfer);
> + struct threaded_resource *tres = threaded_resource(transfer->resource);
> + unsigned required_usage = PIPE_TRANSFER_WRITE |
> + PIPE_TRANSFER_FLUSH_EXPLICIT;
> +
> + if (tres->b.target == PIPE_BUFFER) {
> + if ((transfer->usage & required_usage) == required_usage) {
> + struct pipe_box box;
> +
> + u_box_1d(transfer->box.x + rel_box->x, rel_box->width, &box);
> + tc_buffer_do_flush_region(tc, ttrans, &box);
> + }
> +
> + /* Staging transfers don't send the call to the driver. */
> + if (ttrans->staging)
> + return;
> + }
> +
> + struct tc_transfer_flush_region *p =
> + tc_add_struct_typed_call(tc, TC_CALL_transfer_flush_region,
> + tc_transfer_flush_region);
> + p->transfer = transfer;
> + p->box = *rel_box;
> +}
> +
> +static void
> +tc_call_transfer_unmap(struct pipe_context *pipe, union tc_payload *payload)
> +{
> + pipe->transfer_unmap(pipe, payload->transfer);
> +}
> +
> +static void
> +tc_transfer_unmap(struct pipe_context *_pipe, struct pipe_transfer *transfer)
> +{
> + struct threaded_context *tc = threaded_context(_pipe);
> + struct threaded_transfer *ttrans = threaded_transfer(transfer);
> + struct threaded_resource *tres = threaded_resource(transfer->resource);
> +
> + if (tres->b.target == PIPE_BUFFER) {
> + if (transfer->usage & PIPE_TRANSFER_WRITE &&
> + !(transfer->usage & PIPE_TRANSFER_FLUSH_EXPLICIT))
> + tc_buffer_do_flush_region(tc, ttrans, &transfer->box);
> +
> + /* Staging transfers don't send the call to the driver. */
> + if (ttrans->staging) {
> + pipe_resource_reference(&ttrans->staging, NULL);
> + pipe_resource_reference(&ttrans->b.resource, NULL);
> + slab_free(&tc->pool_transfers, ttrans);
> + return;
> + }
> + }
> +
> + tc_add_small_call(tc, TC_CALL_transfer_unmap)->transfer = transfer;
> +}
> +
> +struct tc_buffer_subdata {
> + struct pipe_resource *resource;
> + unsigned usage, offset, size;
> + char slot[1]; /* more will be allocated if needed */
> +};
> +
> +static void
> +tc_call_buffer_subdata(struct pipe_context *pipe, union tc_payload *payload)
> +{
> + struct tc_buffer_subdata *p = (struct tc_buffer_subdata *)payload;
> +
> + pipe->buffer_subdata(pipe, p->resource, p->usage, p->offset, p->size,
> + p->slot);
> + pipe_resource_reference(&p->resource, NULL);
> +}
> +
> +static void
> +tc_buffer_subdata(struct pipe_context *_pipe,
> + struct pipe_resource *resource,
> + unsigned usage, unsigned offset,
> + unsigned size, const void *data)
> +{
> + struct threaded_context *tc = threaded_context(_pipe);
> + struct threaded_resource *tres = threaded_resource(resource);
> +
> + if (!size)
> + return;
> +
> + usage |= PIPE_TRANSFER_WRITE |
> + PIPE_TRANSFER_DISCARD_RANGE;
> +
> + usage = tc_improve_map_buffer_flags(tc, tres, usage, offset, size);
> +
> + /* Unsynchronized and big transfers should use transfer_map. Also handle
> + * full invalidations, because drivers aren't allowed to do them.
> + */
> + if (usage & (PIPE_TRANSFER_UNSYNCHRONIZED |
> + PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE) ||
> + size > TC_MAX_SUBDATA_BYTES) {
> + struct pipe_transfer *transfer;
> + struct pipe_box box;
> + uint8_t *map = NULL;
> +
> + u_box_1d(offset, size, &box);
> +
> + map = tc_transfer_map(_pipe, resource, 0, usage, &box, &transfer);
> + if (map) {
> + memcpy(map, data, size);
> + tc_transfer_unmap(_pipe, transfer);
> + }
> + return;
> + }
> +
> + util_range_add(&tres->valid_buffer_range, offset, offset + size);
> +
> + /* The upload is small. Enqueue it. */
> + struct tc_buffer_subdata *p =
> + tc_add_slot_based_call(tc, TC_CALL_buffer_subdata, tc_buffer_subdata, size);
> +
> + tc_set_resource_reference(&p->resource, resource);
> + p->usage = usage;
> + p->offset = offset;
> + p->size = size;
> + memcpy(p->slot, data, size);
> +}
> +
> +struct tc_texture_subdata {
> + struct pipe_resource *resource;
> + unsigned level, usage, stride, layer_stride;
> + struct pipe_box box;
> + char slot[1]; /* more will be allocated if needed */
> +};
> +
> +static void
> +tc_call_texture_subdata(struct pipe_context *pipe, union tc_payload *payload)
> +{
> + struct tc_texture_subdata *p = (struct tc_texture_subdata *)payload;
> +
> + pipe->texture_subdata(pipe, p->resource, p->level, p->usage, &p->box,
> + p->slot, p->stride, p->layer_stride);
> + pipe_resource_reference(&p->resource, NULL);
> +}
> +
> +static void
> +tc_texture_subdata(struct pipe_context *_pipe,
> + struct pipe_resource *resource,
> + unsigned level, unsigned usage,
> + const struct pipe_box *box,
> + const void *data, unsigned stride,
> + unsigned layer_stride)
> +{
> + struct threaded_context *tc = threaded_context(_pipe);
> + unsigned size;
> +
> + assert(box->height >= 1);
> + assert(box->depth >= 1);
> +
> + size = (box->depth - 1) * layer_stride +
> + (box->height - 1) * stride +
> + box->width * util_format_get_blocksize(resource->format);
> + if (!size)
> + return;
> +
> + /* Small uploads can be enqueued, big uploads must sync. */
> + if (size <= TC_MAX_SUBDATA_BYTES) {
> + struct tc_texture_subdata *p =
> + tc_add_slot_based_call(tc, TC_CALL_texture_subdata, tc_texture_subdata, size);
> +
> + tc_set_resource_reference(&p->resource, resource);
> + p->level = level;
> + p->usage = usage;
> + p->box = *box;
> + p->stride = stride;
> + p->layer_stride = layer_stride;
> + memcpy(p->slot, data, size);
> + } else {
> + struct pipe_context *pipe = tc->pipe;
> +
> + tc_sync(tc);
> + pipe->texture_subdata(pipe, resource, level, usage, box, data,
> + stride, layer_stride);
> + }
> +}
> +
> +
> +/********************************************************************
> + * miscellaneous
> + */
> +
> +#define TC_FUNC_SYNC_RET0(ret_type, func) \
> + static ret_type \
> + tc_##func(struct pipe_context *_pipe) \
> + { \
> + struct threaded_context *tc = threaded_context(_pipe); \
> + struct pipe_context *pipe = tc->pipe; \
> + tc_sync(tc); \
> + return pipe->func(pipe); \
> + }
> +
> +TC_FUNC_SYNC_RET0(enum pipe_reset_status, get_device_reset_status)
> +TC_FUNC_SYNC_RET0(uint64_t, get_timestamp)
> +
> +static void
> +tc_get_sample_position(struct pipe_context *_pipe,
> + unsigned sample_count, unsigned sample_index,
> + float *out_value)
> +{
> + struct threaded_context *tc = threaded_context(_pipe);
> + struct pipe_context *pipe = tc->pipe;
> +
> + tc_sync(tc);
> + pipe->get_sample_position(pipe, sample_count, sample_index,
> + out_value);
> +}
> +
> +static void
> +tc_set_device_reset_callback(struct pipe_context *_pipe,
> + const struct pipe_device_reset_callback *cb)
> +{
> + struct threaded_context *tc = threaded_context(_pipe);
> + struct pipe_context *pipe = tc->pipe;
> +
> + tc_sync(tc);
> + pipe->set_device_reset_callback(pipe, cb);
> +}
> +
> +struct tc_string_marker {
> + int len;
> + char slot[1]; /* more will be allocated if needed */
> +};
> +
> +static void
> +tc_call_emit_string_marker(struct pipe_context *pipe, union tc_payload *payload)
> +{
> + struct tc_string_marker *p = (struct tc_string_marker *)payload;
> + pipe->emit_string_marker(pipe, p->slot, p->len);
> +}
> +
> +static void
> +tc_emit_string_marker(struct pipe_context *_pipe,
> + const char *string, int len)
> +{
> + struct threaded_context *tc = threaded_context(_pipe);
> +
> + if (len <= TC_MAX_STRING_MARKER_BYTES) {
> + struct tc_string_marker *p =
> + tc_add_slot_based_call(tc, TC_CALL_emit_string_marker, tc_string_marker, len);
> +
> + memcpy(p->slot, string, len);
> + p->len = len;
> + } else {
> + struct pipe_context *pipe = tc->pipe;
> +
> + tc_sync(tc);
> + pipe->emit_string_marker(pipe, string, len);
> + }
> +}
> +
> +static void
> +tc_dump_debug_state(struct pipe_context *_pipe, FILE *stream,
> + unsigned flags)
> +{
> + struct threaded_context *tc = threaded_context(_pipe);
> + struct pipe_context *pipe = tc->pipe;
> +
> + tc_sync(tc);
> + pipe->dump_debug_state(pipe, stream, flags);
> +}
> +
> +static void
> +tc_set_debug_callback(struct pipe_context *_pipe,
> + const struct pipe_debug_callback *cb)
> +{
> + struct threaded_context *tc = threaded_context(_pipe);
> + struct pipe_context *pipe = tc->pipe;
> +
> + tc_sync(tc);
> + pipe->set_debug_callback(pipe, cb);
> +}
> +
> +static void
> +tc_create_fence_fd(struct pipe_context *_pipe,
> + struct pipe_fence_handle **fence, int fd)
> +{
> + struct threaded_context *tc = threaded_context(_pipe);
> + struct pipe_context *pipe = tc->pipe;
> +
> + tc_sync(tc);
> + pipe->create_fence_fd(pipe, fence, fd);
> +}
> +
> +static void
> +tc_fence_server_sync(struct pipe_context *_pipe,
> + struct pipe_fence_handle *fence)
> +{
> + struct threaded_context *tc = threaded_context(_pipe);
> + struct pipe_context *pipe = tc->pipe;
> +
> + tc_sync(tc);
> + pipe->fence_server_sync(pipe, fence);
> +}
> +
> +static struct pipe_video_codec *
> +tc_create_video_codec(struct pipe_context *_pipe,
> + const struct pipe_video_codec *templ)
> +{
> + unreachable("Threaded context should not be enabled for video APIs");
> + return NULL;
> +}
> +
> +static struct pipe_video_buffer *
> +tc_create_video_buffer(struct pipe_context *_pipe,
> + const struct pipe_video_buffer *templ)
> +{
> + unreachable("Threaded context should not be enabled for video APIs");
> + return NULL;
> +}
> +
> +
> +/********************************************************************
> + * draw, launch, clear, blit, copy, flush
> + */
> +
> +static void
> +tc_flush(struct pipe_context *_pipe, struct pipe_fence_handle **fence,
> + unsigned flags)
> +{
> + struct threaded_context *tc = threaded_context(_pipe);
> + struct pipe_context *pipe = tc->pipe;
> + struct threaded_query *tq, *tmp;
> +
> + LIST_FOR_EACH_ENTRY_SAFE(tq, tmp, &tc->unflushed_queries, head_unflushed) {
> + tq->flushed = true;
> + LIST_DEL(&tq->head_unflushed);
> + }
> +
> + /* TODO: deferred flushes? */
> + tc_sync_msg(tc, flags & PIPE_FLUSH_END_OF_FRAME ? "end of frame" :
> + flags & PIPE_FLUSH_DEFERRED ? "deferred fence" : "normal");
> + pipe->flush(pipe, fence, flags);
> +}
> +
> +/* This is actually variable-sized, because indirect isn't allocated if it's
> + * not needed. */
> +struct tc_full_draw_info {
> + struct pipe_draw_info draw;
> + struct pipe_draw_indirect_info indirect;
> +};
> +
> +static void
> +tc_call_draw_vbo(struct pipe_context *pipe, union tc_payload *payload)
> +{
> + struct tc_full_draw_info *info = (struct tc_full_draw_info*)payload;
> +
> + pipe->draw_vbo(pipe, &info->draw);
> + pipe_so_target_reference(&info->draw.count_from_stream_output, NULL);
> + if (info->draw.index_size)
> + pipe_resource_reference(&info->draw.index.resource, NULL);
> + if (info->draw.indirect) {
> + pipe_resource_reference(&info->indirect.buffer, NULL);
> + pipe_resource_reference(&info->indirect.indirect_draw_count, NULL);
> + }
> +}
> +
> +static struct tc_full_draw_info *
> +tc_add_draw_vbo(struct pipe_context *_pipe, bool indirect)
> +{
> + return (struct tc_full_draw_info*)
> + tc_add_sized_call(threaded_context(_pipe), TC_CALL_draw_vbo,
> + indirect ? sizeof(struct tc_full_draw_info) :
> + sizeof(struct pipe_draw_info));
> +}
> +
> +static void
> +tc_draw_vbo(struct pipe_context *_pipe, const struct pipe_draw_info *info)
> +{
> + struct threaded_context *tc = threaded_context(_pipe);
> + struct pipe_draw_indirect_info *indirect = info->indirect;
> + unsigned index_size = info->index_size;
> + bool has_user_indices = info->has_user_indices;
> +
> + if (index_size && has_user_indices) {
> + unsigned size = info->count * index_size;
> + struct pipe_resource *buffer = NULL;
> + unsigned offset;
> +
> + tc_assert(!indirect);
> +
> + /* This must be done before adding draw_vbo, because it could generate
> + * e.g. transfer_unmap and flush partially-uninitialized draw_vbo
> + * to the driver if it was done afterwards.
> + */
> + u_upload_data(tc->base.stream_uploader, 0, size, 4, info->index.user,
> + &offset, &buffer);
> + if (unlikely(!buffer))
> + return;
> +
> + struct tc_full_draw_info *p = tc_add_draw_vbo(_pipe, indirect != NULL);
indirect is always NULL here.
> + p->draw.count_from_stream_output = NULL;
> + pipe_so_target_reference(&p->draw.count_from_stream_output,
> + info->count_from_stream_output);
> + if (index_size && !has_user_indices) {
This can never be true, because the if above checks has_user_indices.
> + tc_set_resource_reference(&p->draw.index.resource,
> + info->index.resource);
> + }
> + memcpy(&p->draw, info, sizeof(*info));
> + p->draw.has_user_indices = false;
> + p->draw.index.resource = buffer;
> + p->draw.start = offset / index_size;
> + } else {
> + /* Non-indexed call or indexed with a real index buffer. */
> + struct tc_full_draw_info *p = tc_add_draw_vbo(_pipe, indirect != NULL);
> + p->draw.count_from_stream_output = NULL;
> + pipe_so_target_reference(&p->draw.count_from_stream_output,
> + info->count_from_stream_output);
> + if (index_size) {
> + tc_set_resource_reference(&p->draw.index.resource,
> + info->index.resource);
> + }
> + memcpy(&p->draw, info, sizeof(*info));
> +
> + if (indirect) {
> + tc_set_resource_reference(&p->draw.indirect->buffer, indirect->buffer);
> + tc_set_resource_reference(&p->indirect.indirect_draw_count,
> + indirect->indirect_draw_count);
> + memcpy(&p->indirect, indirect, sizeof(*indirect));
> + p->draw.indirect = &p->indirect;
> + }
> + }
> +}
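Putting my two comments above together, the user-index branch could then
shrink to something like this (rough sketch, untested):

      struct tc_full_draw_info *p = tc_add_draw_vbo(_pipe, false);

      p->draw.count_from_stream_output = NULL;
      pipe_so_target_reference(&p->draw.count_from_stream_output,
                               info->count_from_stream_output);
      memcpy(&p->draw, info, sizeof(*info));
      p->draw.has_user_indices = false;
      p->draw.index.resource = buffer;
      p->draw.start = offset / index_size;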
> +
> +static void
> +tc_call_launch_grid(struct pipe_context *pipe, union tc_payload *payload)
> +{
> + struct pipe_grid_info *p = (struct pipe_grid_info *)payload;
> +
> + pipe->launch_grid(pipe, p);
> + pipe_resource_reference(&p->indirect, NULL);
> +}
> +
> +static void
> +tc_launch_grid(struct pipe_context *_pipe,
> + const struct pipe_grid_info *info)
> +{
> + struct threaded_context *tc = threaded_context(_pipe);
> + struct pipe_grid_info *p = tc_add_struct_typed_call(tc, TC_CALL_launch_grid,
> + pipe_grid_info);
> + assert(info->input == NULL);
> +
> + tc_set_resource_reference(&p->indirect, info->indirect);
> + memcpy(p, info, sizeof(*info));
> +}
> +
> +static void
> +tc_call_resource_copy_region(struct pipe_context *pipe, union tc_payload *payload)
> +{
> + struct tc_resource_copy_region *p = (struct tc_resource_copy_region *)payload;
> +
> + pipe->resource_copy_region(pipe, p->dst, p->dst_level, p->dstx, p->dsty,
> + p->dstz, p->src, p->src_level, &p->src_box);
> + pipe_resource_reference(&p->dst, NULL);
> + pipe_resource_reference(&p->src, NULL);
> +}
> +
> +static void
> +tc_resource_copy_region(struct pipe_context *_pipe,
> + struct pipe_resource *dst, unsigned dst_level,
> + unsigned dstx, unsigned dsty, unsigned dstz,
> + struct pipe_resource *src, unsigned src_level,
> + const struct pipe_box *src_box)
> +{
> + struct threaded_context *tc = threaded_context(_pipe);
> + struct threaded_resource *tdst = threaded_resource(dst);
> + struct tc_resource_copy_region *p =
> + tc_add_struct_typed_call(tc, TC_CALL_resource_copy_region,
> + tc_resource_copy_region);
> +
> + tc_set_resource_reference(&p->dst, dst);
> + p->dst_level = dst_level;
> + p->dstx = dstx;
> + p->dsty = dsty;
> + p->dstz = dstz;
> + tc_set_resource_reference(&p->src, src);
> + p->src_level = src_level;
> + p->src_box = *src_box;
> +
> + if (dst->target == PIPE_BUFFER)
> + util_range_add(&tdst->valid_buffer_range, dstx, dstx + src_box->width);
> +}
> +
> +static void
> +tc_call_blit(struct pipe_context *pipe, union tc_payload *payload)
> +{
> + struct pipe_blit_info *blit = (struct pipe_blit_info*)payload;
> +
> + pipe->blit(pipe, blit);
> + pipe_resource_reference(&blit->dst.resource, NULL);
> + pipe_resource_reference(&blit->src.resource, NULL);
> +}
> +
> +static void
> +tc_blit(struct pipe_context *_pipe, const struct pipe_blit_info *info)
> +{
> + struct threaded_context *tc = threaded_context(_pipe);
> + struct pipe_blit_info *blit =
> + tc_add_struct_typed_call(tc, TC_CALL_blit, pipe_blit_info);
> +
> + tc_set_resource_reference(&blit->dst.resource, info->dst.resource);
> + tc_set_resource_reference(&blit->src.resource, info->src.resource);
> + memcpy(blit, info, sizeof(*info));
> +}
> +
> +struct tc_generate_mipmap {
> + struct pipe_resource *res;
> + enum pipe_format format;
> + unsigned base_level;
> + unsigned last_level;
> + unsigned first_layer;
> + unsigned last_layer;
> +};
> +
> +static void
> +tc_call_generate_mipmap(struct pipe_context *pipe, union tc_payload *payload)
> +{
> + struct tc_generate_mipmap *p = (struct tc_generate_mipmap *)payload;
> + bool result = pipe->generate_mipmap(pipe, p->res, p->format, p->base_level,
> + p->last_level, p->first_layer,
> + p->last_layer);
> + assert(result);
> + pipe_resource_reference(&p->res, NULL);
> +}
> +
> +static boolean
> +tc_generate_mipmap(struct pipe_context *_pipe,
> + struct pipe_resource *res,
> + enum pipe_format format,
> + unsigned base_level,
> + unsigned last_level,
> + unsigned first_layer,
> + unsigned last_layer)
> +{
> + struct threaded_context *tc = threaded_context(_pipe);
> + struct pipe_context *pipe = tc->pipe;
> + struct pipe_screen *screen = pipe->screen;
> + unsigned bind = PIPE_BIND_SAMPLER_VIEW;
> +
> + if (util_format_is_depth_or_stencil(format))
> + bind = PIPE_BIND_DEPTH_STENCIL;
> + else
> + bind = PIPE_BIND_RENDER_TARGET;
> +
> + if (!screen->is_format_supported(screen, format, res->target,
> + res->nr_samples, bind))
> + return false;
This feels like the kind of thing the state tracker should be checking
before it calls this function...
Cheers,
Nicolai
> +
> + struct tc_generate_mipmap *p =
> + tc_add_struct_typed_call(tc, TC_CALL_generate_mipmap, tc_generate_mipmap);
> +
> + tc_set_resource_reference(&p->res, res);
> + p->format = format;
> + p->base_level = base_level;
> + p->last_level = last_level;
> + p->first_layer = first_layer;
> + p->last_layer = last_layer;
> + return true;
> +}
> +
> +static void
> +tc_call_flush_resource(struct pipe_context *pipe, union tc_payload *payload)
> +{
> + pipe->flush_resource(pipe, payload->resource);
> + pipe_resource_reference(&payload->resource, NULL);
> +}
> +
> +static void
> +tc_flush_resource(struct pipe_context *_pipe,
> + struct pipe_resource *resource)
> +{
> + struct threaded_context *tc = threaded_context(_pipe);
> + union tc_payload *payload = tc_add_small_call(tc, TC_CALL_flush_resource);
> +
> + tc_set_resource_reference(&payload->resource, resource);
> +}
> +
> +static void
> +tc_call_invalidate_resource(struct pipe_context *pipe, union tc_payload *payload)
> +{
> + pipe->invalidate_resource(pipe, payload->resource);
> + pipe_resource_reference(&payload->resource, NULL);
> +}
> +
> +static void
> +tc_invalidate_resource(struct pipe_context *_pipe,
> + struct pipe_resource *resource)
> +{
> + struct threaded_context *tc = threaded_context(_pipe);
> +
> + if (resource->target == PIPE_BUFFER) {
> + tc_invalidate_buffer(tc, threaded_resource(resource));
> + return;
> + }
> +
> + union tc_payload *payload = tc_add_small_call(tc, TC_CALL_invalidate_resource);
> + tc_set_resource_reference(&payload->resource, resource);
> +}
> +
> +struct tc_clear {
> + unsigned buffers;
> + union pipe_color_union color;
> + double depth;
> + unsigned stencil;
> +};
> +
> +static void
> +tc_call_clear(struct pipe_context *pipe, union tc_payload *payload)
> +{
> + struct tc_clear *p = (struct tc_clear *)payload;
> + pipe->clear(pipe, p->buffers, &p->color, p->depth, p->stencil);
> +}
> +
> +static void
> +tc_clear(struct pipe_context *_pipe, unsigned buffers,
> + const union pipe_color_union *color, double depth,
> + unsigned stencil)
> +{
> + struct threaded_context *tc = threaded_context(_pipe);
> + struct tc_clear *p = tc_add_struct_typed_call(tc, TC_CALL_clear, tc_clear);
> +
> + p->buffers = buffers;
> + p->color = *color;
> + p->depth = depth;
> + p->stencil = stencil;
> +}
> +
> +static void
> +tc_clear_render_target(struct pipe_context *_pipe,
> + struct pipe_surface *dst,
> + const union pipe_color_union *color,
> + unsigned dstx, unsigned dsty,
> + unsigned width, unsigned height,
> + bool render_condition_enabled)
> +{
> + struct threaded_context *tc = threaded_context(_pipe);
> + struct pipe_context *pipe = tc->pipe;
> +
> + tc_sync(tc);
> + pipe->clear_render_target(pipe, dst, color, dstx, dsty, width, height,
> + render_condition_enabled);
> +}
> +
> +static void
> +tc_clear_depth_stencil(struct pipe_context *_pipe,
> + struct pipe_surface *dst, unsigned clear_flags,
> + double depth, unsigned stencil, unsigned dstx,
> + unsigned dsty, unsigned width, unsigned height,
> + bool render_condition_enabled)
> +{
> + struct threaded_context *tc = threaded_context(_pipe);
> + struct pipe_context *pipe = tc->pipe;
> +
> + tc_sync(tc);
> + pipe->clear_depth_stencil(pipe, dst, clear_flags, depth, stencil,
> + dstx, dsty, width, height,
> + render_condition_enabled);
> +}
> +
> +struct tc_clear_buffer {
> + struct pipe_resource *res;
> + unsigned offset;
> + unsigned size;
> + char clear_value[16];
> + int clear_value_size;
> +};
> +
> +static void
> +tc_call_clear_buffer(struct pipe_context *pipe, union tc_payload *payload)
> +{
> + struct tc_clear_buffer *p = (struct tc_clear_buffer *)payload;
> +
> + pipe->clear_buffer(pipe, p->res, p->offset, p->size, p->clear_value,
> + p->clear_value_size);
> + pipe_resource_reference(&p->res, NULL);
> +}
> +
> +static void
> +tc_clear_buffer(struct pipe_context *_pipe, struct pipe_resource *res,
> + unsigned offset, unsigned size,
> + const void *clear_value, int clear_value_size)
> +{
> + struct threaded_context *tc = threaded_context(_pipe);
> + struct threaded_resource *tres = threaded_resource(res);
> + struct tc_clear_buffer *p =
> + tc_add_struct_typed_call(tc, TC_CALL_clear_buffer, tc_clear_buffer);
> +
> + tc_set_resource_reference(&p->res, res);
> + p->offset = offset;
> + p->size = size;
> + memcpy(p->clear_value, clear_value, clear_value_size);
> + p->clear_value_size = clear_value_size;
> +
> + util_range_add(&tres->valid_buffer_range, offset, offset + size);
> +}
> +
> +struct tc_clear_texture {
> + struct pipe_resource *res;
> + unsigned level;
> + struct pipe_box box;
> + char data[16];
> +};
> +
> +static void
> +tc_call_clear_texture(struct pipe_context *pipe, union tc_payload *payload)
> +{
> + struct tc_clear_texture *p = (struct tc_clear_texture *)payload;
> +
> + pipe->clear_texture(pipe, p->res, p->level, &p->box, p->data);
> + pipe_resource_reference(&p->res, NULL);
> +}
> +
> +static void
> +tc_clear_texture(struct pipe_context *_pipe, struct pipe_resource *res,
> + unsigned level, const struct pipe_box *box, const void *data)
> +{
> + struct threaded_context *tc = threaded_context(_pipe);
> + struct tc_clear_texture *p =
> + tc_add_struct_typed_call(tc, TC_CALL_clear_texture, tc_clear_texture);
> +
> + tc_set_resource_reference(&p->res, res);
> + p->level = level;
> + p->box = *box;
> + memcpy(p->data, data,
> + util_format_get_blocksize(res->format));
> +}
> +
> +struct tc_resource_commit {
> + struct pipe_resource *res;
> + unsigned level;
> + struct pipe_box box;
> + bool commit;
> +};
> +
> +static void
> +tc_call_resource_commit(struct pipe_context *pipe, union tc_payload *payload)
> +{
> + struct tc_resource_commit *p = (struct tc_resource_commit *)payload;
> +
> + pipe->resource_commit(pipe, p->res, p->level, &p->box, p->commit);
> + pipe_resource_reference(&p->res, NULL);
> +}
> +
> +static bool
> +tc_resource_commit(struct pipe_context *_pipe, struct pipe_resource *res,
> + unsigned level, struct pipe_box *box, bool commit)
> +{
> + struct threaded_context *tc = threaded_context(_pipe);
> + struct tc_resource_commit *p =
> + tc_add_struct_typed_call(tc, TC_CALL_resource_commit, tc_resource_commit);
> +
> + tc_set_resource_reference(&p->res, res);
> + p->level = level;
> + p->box = *box;
> + p->commit = commit;
> + return true; /* we don't care about the return value for this call */
> +}
> +
> +
> +/********************************************************************
> + * create & destroy
> + */
> +
> +static void
> +tc_destroy(struct pipe_context *_pipe)
> +{
> + struct threaded_context *tc = threaded_context(_pipe);
> + struct pipe_context *pipe = tc->pipe;
> +
> + tc_sync(tc);
> +
> + if (util_queue_is_initialized(&tc->queue)) {
> + util_queue_destroy(&tc->queue);
> +
> + for (unsigned i = 0; i < TC_MAX_BATCHES; i++)
> + util_queue_fence_destroy(&tc->batch_slots[i].fence);
> + }
> +
> + if (tc->base.const_uploader &&
> + tc->base.stream_uploader != tc->base.const_uploader)
> + u_upload_destroy(tc->base.const_uploader);
> +
> + if (tc->base.stream_uploader)
> + u_upload_destroy(tc->base.stream_uploader);
> +
> + slab_destroy_child(&tc->pool_transfers);
> + pipe->destroy(pipe);
> + FREE(tc);
> +}
> +
> +static const tc_execute execute_func[TC_NUM_CALLS] = {
> +#define CALL(name) tc_call_##name,
> +#include "u_threaded_context_calls.h"
> +#undef CALL
> +};
> +
> +/**
> + * Wrap an existing pipe_context into a threaded_context.
> + *
> + * \param pipe pipe_context to wrap
> + * \param parent_transfer_pool parent slab pool set up for creating pipe_-
> + * transfer objects; the driver should have one
> + * in pipe_screen.
> + * \param replace_buffer callback for replacing a pipe_resource's storage
> + * with another pipe_resource's storage.
> + * \param out if successful, the threaded_context will be returned here in
> + * addition to the return value if "out" != NULL
> + */
> +struct pipe_context *
> +threaded_context_create(struct pipe_context *pipe,
> + struct slab_parent_pool *parent_transfer_pool,
> + tc_replace_buffer_storage_func replace_buffer,
> + struct threaded_context **out)
> +{
> + struct threaded_context *tc;
> +
> + STATIC_ASSERT(sizeof(union tc_payload) <= 8);
> + STATIC_ASSERT(sizeof(struct tc_call) <= 16);
> +
> + if (!pipe)
> + return NULL;
> +
> + util_cpu_detect();
> +
> + if (!debug_get_bool_option("GALLIUM_THREAD", util_cpu_caps.nr_cpus > 1))
> + return pipe;
> +
> + tc = CALLOC_STRUCT(threaded_context);
> + if (!tc) {
> + pipe->destroy(pipe);
> + return NULL;
> + }
> +
> + /* The driver context isn't wrapped, so set its "priv" to NULL. */
> + pipe->priv = NULL;
> +
> + tc->pipe = pipe;
> + tc->replace_buffer_storage = replace_buffer;
> + tc->map_buffer_alignment =
> + pipe->screen->get_param(pipe->screen, PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT);
> + tc->base.priv = pipe; /* priv points to the wrapped driver context */
> + tc->base.screen = pipe->screen;
> + tc->base.destroy = tc_destroy;
> +
> + tc->base.stream_uploader = u_upload_clone(&tc->base, pipe->stream_uploader);
> + if (pipe->stream_uploader == pipe->const_uploader)
> + tc->base.const_uploader = tc->base.stream_uploader;
> + else
> + tc->base.const_uploader = u_upload_clone(&tc->base, pipe->const_uploader);
> +
> + if (!tc->base.stream_uploader || !tc->base.const_uploader)
> + goto fail;
> +
> + /* The queue size is the number of batches "waiting". Batches are removed
> + * from the queue before being executed, so keep one tc_batch slot for that
> + * execution. Also, keep one unused slot for an unflushed batch.
> + */
> + if (!util_queue_init(&tc->queue, "gallium_drv", TC_MAX_BATCHES - 2, 1))
> + goto fail;
> +
> + for (unsigned i = 0; i < TC_MAX_BATCHES; i++) {
> + tc->batch_slots[i].sentinel = TC_SENTINEL;
> + tc->batch_slots[i].sentinel2 = TC_SENTINEL;
> + tc->batch_slots[i].pipe = pipe;
> + util_queue_fence_init(&tc->batch_slots[i].fence);
> + }
> +
> + LIST_INITHEAD(&tc->unflushed_queries);
> +
> + slab_create_child(&tc->pool_transfers, parent_transfer_pool);
> +
> +#define CTX_INIT(_member) \
> + tc->base._member = tc->pipe->_member ? tc_##_member : NULL
> +
> + CTX_INIT(flush);
> + CTX_INIT(draw_vbo);
> + CTX_INIT(launch_grid);
> + CTX_INIT(resource_copy_region);
> + CTX_INIT(blit);
> + CTX_INIT(clear);
> + CTX_INIT(clear_render_target);
> + CTX_INIT(clear_depth_stencil);
> + CTX_INIT(clear_buffer);
> + CTX_INIT(clear_texture);
> + CTX_INIT(flush_resource);
> + CTX_INIT(generate_mipmap);
> + CTX_INIT(render_condition);
> + CTX_INIT(create_query);
> + CTX_INIT(create_batch_query);
> + CTX_INIT(destroy_query);
> + CTX_INIT(begin_query);
> + CTX_INIT(end_query);
> + CTX_INIT(get_query_result);
> + CTX_INIT(get_query_result_resource);
> + CTX_INIT(set_active_query_state);
> + CTX_INIT(create_blend_state);
> + CTX_INIT(bind_blend_state);
> + CTX_INIT(delete_blend_state);
> + CTX_INIT(create_sampler_state);
> + CTX_INIT(bind_sampler_states);
> + CTX_INIT(delete_sampler_state);
> + CTX_INIT(create_rasterizer_state);
> + CTX_INIT(bind_rasterizer_state);
> + CTX_INIT(delete_rasterizer_state);
> + CTX_INIT(create_depth_stencil_alpha_state);
> + CTX_INIT(bind_depth_stencil_alpha_state);
> + CTX_INIT(delete_depth_stencil_alpha_state);
> + CTX_INIT(create_fs_state);
> + CTX_INIT(bind_fs_state);
> + CTX_INIT(delete_fs_state);
> + CTX_INIT(create_vs_state);
> + CTX_INIT(bind_vs_state);
> + CTX_INIT(delete_vs_state);
> + CTX_INIT(create_gs_state);
> + CTX_INIT(bind_gs_state);
> + CTX_INIT(delete_gs_state);
> + CTX_INIT(create_tcs_state);
> + CTX_INIT(bind_tcs_state);
> + CTX_INIT(delete_tcs_state);
> + CTX_INIT(create_tes_state);
> + CTX_INIT(bind_tes_state);
> + CTX_INIT(delete_tes_state);
> + CTX_INIT(create_compute_state);
> + CTX_INIT(bind_compute_state);
> + CTX_INIT(delete_compute_state);
> + CTX_INIT(create_vertex_elements_state);
> + CTX_INIT(bind_vertex_elements_state);
> + CTX_INIT(delete_vertex_elements_state);
> + CTX_INIT(set_blend_color);
> + CTX_INIT(set_stencil_ref);
> + CTX_INIT(set_sample_mask);
> + CTX_INIT(set_min_samples);
> + CTX_INIT(set_clip_state);
> + CTX_INIT(set_constant_buffer);
> + CTX_INIT(set_framebuffer_state);
> + CTX_INIT(set_polygon_stipple);
> + CTX_INIT(set_scissor_states);
> + CTX_INIT(set_viewport_states);
> + CTX_INIT(set_window_rectangles);
> + CTX_INIT(set_sampler_views);
> + CTX_INIT(set_tess_state);
> + CTX_INIT(set_shader_buffers);
> + CTX_INIT(set_shader_images);
> + CTX_INIT(set_vertex_buffers);
> + CTX_INIT(create_stream_output_target);
> + CTX_INIT(stream_output_target_destroy);
> + CTX_INIT(set_stream_output_targets);
> + CTX_INIT(create_sampler_view);
> + CTX_INIT(sampler_view_destroy);
> + CTX_INIT(create_surface);
> + CTX_INIT(surface_destroy);
> + CTX_INIT(transfer_map);
> + CTX_INIT(transfer_flush_region);
> + CTX_INIT(transfer_unmap);
> + CTX_INIT(buffer_subdata);
> + CTX_INIT(texture_subdata);
> + CTX_INIT(texture_barrier);
> + CTX_INIT(memory_barrier);
> + CTX_INIT(resource_commit);
> + CTX_INIT(create_video_codec);
> + CTX_INIT(create_video_buffer);
> + CTX_INIT(set_compute_resources);
> + CTX_INIT(set_global_binding);
> + CTX_INIT(get_sample_position);
> + CTX_INIT(invalidate_resource);
> + CTX_INIT(get_device_reset_status);
> + CTX_INIT(set_device_reset_callback);
> + CTX_INIT(dump_debug_state);
> + CTX_INIT(emit_string_marker);
> + CTX_INIT(set_debug_callback);
> + CTX_INIT(create_fence_fd);
> + CTX_INIT(fence_server_sync);
> + CTX_INIT(get_timestamp);
> +#undef CTX_INIT
> +
> + if (out)
> + *out = tc;
> +
> + return &tc->base;
> +
> +fail:
> + tc_destroy(&tc->base);
> + return NULL;
> +}
> diff --git a/src/gallium/auxiliary/util/u_threaded_context.h b/src/gallium/auxiliary/util/u_threaded_context.h
> new file mode 100644
> index 0000000..1485bf3
> --- /dev/null
> +++ b/src/gallium/auxiliary/util/u_threaded_context.h
> @@ -0,0 +1,349 @@
> +/**************************************************************************
> + *
> + * Copyright 2017 Advanced Micro Devices, Inc.
> + * All Rights Reserved.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * on the rights to use, copy, modify, merge, publish, distribute, sub
> + * license, and/or sell copies of the Software, and to permit persons to whom
> + * the Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
> + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
> + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
> + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
> + * USE OR OTHER DEALINGS IN THE SOFTWARE.
> + *
> + **************************************************************************/
> +
> +/* This is a wrapper for pipe_context that executes all pipe_context calls
> + * in another thread.
> + *
> + *
> + * Guidelines for adopters and deviations from Gallium
> + * ---------------------------------------------------
> + *
> + * 1) pipe_context is wrapped. pipe_screen isn't wrapped. All pipe_screen
> + * driver functions that take a context (fence_finish, texture_get_handle)
> + * should manually unwrap pipe_context by doing:
> + * pipe = threaded_context_unwrap_sync(pipe);
> + *
> + * pipe_context::priv is used to unwrap the context, so drivers and state
> + * trackers shouldn't use it.
> + *
> + * No other objects are wrapped.
> + *
> + * 2) Drivers must subclass and initialize these structures:
> + * - threaded_resource for pipe_resource (use threaded_resource_init/deinit)
> + * - threaded_query for pipe_query (zero memory)
> + * - threaded_transfer for pipe_transfer (zero memory)
> + *
> + * 3) The threaded context must not be enabled for contexts that can use video
> + * codecs.
> + *
> + * 4) Changes in driver behavior:
> + * - begin_query and end_query always return true; return values from
> + * the driver are ignored.
> + * - generate_mipmap uses is_format_supported to determine success;
> + * the return value from the driver is ignored.
> + * - resource_commit always returns true; failures are ignored.
> + * - If a non-async debug callback is set, the threaded context keeps using
> + * asynchronous execution. This is OK for shader-db, but the driver
> + * shouldn't use the debug callback in any other way.
> + *
> + *
> + * Thread-safety requirements on context functions
> + * -----------------------------------------------
> + *
> + * These pipe_context functions are executed directly, so they shouldn't use
> + * pipe_context in an unsafe way. They are de-facto screen functions now:
> + * - create_query
> + * - create_batch_query
> + * - create_*_state (all CSOs and shaders)
> + * - Make sure the shader compiler doesn't use any per-context stuff.
> + * (e.g. LLVM target machine)
> + * - Only pipe_context's debug callback for shader dumps is guaranteed to
> + * be up to date, because set_debug_callback synchronizes execution.
> + * - create_surface
> + * - surface_destroy
> + * - create_sampler_view
> + * - sampler_view_destroy
> + * - stream_output_target_destroy
> + * - transfer_map (only unsynchronized buffer mappings)
> + * - get_query_result (when threaded_query::flushed == true)
> + *
> + * Create calls causing a sync that can't be async due to driver limitations:
> + * - create_stream_output_target
> + *
> + *
> + * Transfer_map rules for buffer mappings
> + * --------------------------------------
> + *
> + * 1) If transfer_map has PIPE_TRANSFER_UNSYNCHRONIZED, the call is made
> + * in the non-driver thread without flushing the queue. The driver will
> + * receive TC_TRANSFER_MAP_THREADED_UNSYNC in addition to PIPE_TRANSFER_-
> + * UNSYNCHRONIZED to indicate this.
> + * Note that transfer_unmap is always enqueued and called from the driver
> + * thread.
> + *
> + * 2) The driver isn't allowed to infer unsynchronized mappings by tracking
> + * the valid buffer range. The threaded context always sends TC_TRANSFER_-
> + * MAP_IGNORE_VALID_RANGE to indicate this. Ignoring the flag will lead
> + * to failures.
> + * The threaded context does its own detection of unsynchronized mappings.
> + *
> + * 3) The driver isn't allowed to do buffer invalidations by itself under any
> + * circumstances. This is necessary for unsynchronized maps to map the latest
> + * version of the buffer. (because invalidations can be queued, while
> + * unsynchronized maps are not queued and they should return the latest
> + * storage after invalidation). The threaded context always sends
> + * TC_TRANSFER_MAP_NO_INVALIDATE into transfer_map and buffer_subdata to
> + * indicate this. Ignoring the flag will lead to failures.
> + * The threaded context uses its own buffer invalidation mechanism.
> + *
> + *
> + * Additional requirements
> + * -----------------------
> + *
> + * get_query_result:
> + * If threaded_query::flushed == true, get_query_result should assume that
> + * it's called from a non-driver thread, in which case the driver shouldn't
> + * use the context in an unsafe way.
> + *
> + * replace_buffer_storage:
> + * The driver has to implement this callback, which will be called when
> + * the threaded context wants to replace a resource's backing storage with
> + * another resource's backing storage. The threaded context uses it to
> + * implement buffer invalidation. This call is always queued.
> + *
> + *
> + * Performance gotchas
> + * -------------------
> + *
> + * Buffer invalidations are done unconditionally - they don't check whether
> + * the buffer is busy. This can cause drivers to have more live allocations
> + * and CPU mappings than necessary.
> + *
> + *
> + * How it works (queue architecture)
> + * ---------------------------------
> + *
> + * There is a multithreaded queue consisting of batches, each batch consisting
> + * of call slots. Each call slot consists of an 8-byte header (call ID +
> + * call size + constant 32-bit marker for integrity checking) and an 8-byte
> + * body for per-call data. That is 16 bytes per call slot.
> + *
> + * Simple calls such as bind_xx_state(CSO) occupy only one call slot. Bigger
> + * calls occupy multiple call slots depending on the size needed by call
> + * parameters. That means that calls can have a variable size in the batch.
> + * For example, set_vertex_buffers(count = any, buffers = NULL) occupies only
> + * 1 call slot, but set_vertex_buffers(count = 5) occupies 6 call slots.
> + * Even though the first call slot can use only 8 bytes for data, additional
> + * call slots used by the same call can use all 16 bytes for data.
> + * For example, a call using 2 call slots has 24 bytes of space for data.
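Maybe worth adding a concrete example of the slot math to this comment, e.g.:
with the 8-byte header, a call with a 40-byte payload needs
DIV_ROUND_UP(8 + 40, 16) = 3 call slots and thus gets 3 * 16 - 8 = 40 bytes
of space for data.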
> + *
> + * Once a batch is full and there is no space for the next call, it's flushed,
> + * meaning that it's added to the queue for execution in the other thread.
> + * The batches are ordered in a ring and reused once they are idle again.
> + * The batching is necessary for low queue/mutex overhead.
> + *
> + */
> +
> +#ifndef U_THREADED_CONTEXT_H
> +#define U_THREADED_CONTEXT_H
> +
> +#include "pipe/p_context.h"
> +#include "pipe/p_state.h"
> +#include "util/u_queue.h"
> +#include "util/u_range.h"
> +#include "util/slab.h"
> +
> +/* These are transfer flags sent to drivers. */
> +/* Never infer whether it's safe to use unsynchronized mappings: */
> +#define TC_TRANSFER_MAP_IGNORE_VALID_RANGE (1u << 29)
> +/* Don't invalidate buffers: */
> +#define TC_TRANSFER_MAP_NO_INVALIDATE (1u << 30)
> +/* transfer_map is called from a non-driver thread: */
> +#define TC_TRANSFER_MAP_THREADED_UNSYNC (1u << 31)
> +
> +/* Size of the queue = number of batch slots in memory.
> + * - 1 batch is always idle and records new commands
> + * - 1 batch is being executed
> + * so the queue size is TC_MAX_BATCHES - 2 = number of waiting batches.
> + *
> + * Use a size as small as possible for low CPU L2 cache usage but large enough
> + * so that the queue isn't stalled too often for not having enough idle batch
> + * slots.
> + */
> +#define TC_MAX_BATCHES 10
> +
> +/* The size of one batch. Non-trivial calls (i.e. not setting a CSO pointer)
> + * can occupy multiple call slots.
> + *
> + * The idea is to have batches as small as possible but large enough so that
> + * the queuing and mutex overhead is negligible.
> + */
> +#define TC_CALLS_PER_BATCH 192
> +
> +/* Threshold for when to use the queue or sync. */
> +#define TC_MAX_STRING_MARKER_BYTES 512
> +
> +/* Threshold for when to enqueue buffer/texture_subdata as-is.
> + * If the upload size is greater than this, this is done instead:
> + * - for buffers: DISCARD_RANGE is done by the threaded context
> + * - for textures: sync and call the driver directly
> + */
> +#define TC_MAX_SUBDATA_BYTES 320
> +
> +typedef void (*tc_replace_buffer_storage_func)(struct pipe_context *ctx,
> + struct pipe_resource *dst,
> + struct pipe_resource *src);
> +
> +struct threaded_resource {
> + struct pipe_resource b;
> + const struct u_resource_vtbl *vtbl;
> +
> + /* Since buffer invalidations are queued, we can't use the base resource
> + * for unsynchronized mappings. This points to the latest version of
> + * the buffer after the latest invalidation. It's only used for unsynchro-
> + * nized mappings in the non-driver thread. Initially it's set to &b.
> + */
> + struct pipe_resource *latest;
> +
> + /* The buffer range which is initialized (with a write transfer, streamout,
> + * or writable shader resources). The remainder of the buffer is considered
> + * invalid and can be mapped unsynchronized.
> + *
> + * This allows unsynchronized mapping of a buffer range which hasn't been
> + * used yet. It's for applications which forget to use the unsynchronized
> + * map flag and expect the driver to figure it out.
> + *
> + * Drivers should set this to the full range for buffers backed by user
> + * memory.
> + */
> + struct util_range valid_buffer_range;
> +
> + /* If "this" is not the base instance of the buffer, but it's one of its
> + * reallocations (set in "latest" of the base instance), this points to
> + * the valid range of the base instance. It's used for transfers after
> + * a buffer invalidation, because such transfers operate on "latest", not
> + * the base instance. Initially it's set to &valid_buffer_range.
> + */
> + struct util_range *base_valid_buffer_range;
> +
> + /* Drivers are required to update this for shared resources and user
> + * pointers. */
> + bool is_shared;
> + bool is_user_ptr;
> +};
> +
> +struct threaded_transfer {
> + struct pipe_transfer b;
> +
> + /* Staging buffer for DISCARD_RANGE transfers. */
> + struct pipe_resource *staging;
> +
> + /* Offset into the staging buffer, because the backing buffer is
> + * sub-allocated. */
> + unsigned offset;
> +};
> +
> +struct threaded_query {
> + /* The query is added to the list in end_query and removed in flush. */
> + struct list_head head_unflushed;
> +
> + /* Whether pipe->flush has been called after end_query. */
> + bool flushed;
> +};
> +
> +/* This is the second half of tc_call containing call data.
> + * Most calls will typecast this to the type they need, typically larger
> + * than 8 bytes.
> + */
> +union tc_payload {
> + struct pipe_query *query;
> + struct pipe_resource *resource;
> + struct pipe_transfer *transfer;
> + uint64_t __use_8_bytes;
> +};
> +
> +struct tc_call {
> + unsigned sentinel;
> + ushort num_call_slots;
> + ushort call_id;
> + union tc_payload payload;
> +};
> +
> +struct tc_batch {
> + struct pipe_context *pipe;
> + unsigned sentinel;
> + unsigned num_calls;
> + struct util_queue_fence fence;
> + struct tc_call call[TC_CALLS_PER_BATCH];
> + unsigned sentinel2;
> +};
> +
> +struct threaded_context {
> + struct pipe_context base;
> + struct pipe_context *pipe;
> + struct slab_child_pool pool_transfers;
> + tc_replace_buffer_storage_func replace_buffer_storage;
> + unsigned map_buffer_alignment;
> +
> + struct list_head unflushed_queries;
> +
> + /* Counters for the HUD. */
> + unsigned num_offloaded_calls;
> + unsigned num_direct_calls;
> + unsigned num_syncs;
> +
> + struct util_queue queue;
> + struct util_queue_fence *fence;
> +
> + unsigned last, next;
> + struct tc_batch batch_slots[TC_MAX_BATCHES];
> +};
> +
> +void threaded_resource_init(struct pipe_resource *res);
> +void threaded_resource_deinit(struct pipe_resource *res);
> +struct pipe_context *threaded_context_unwrap_sync(struct pipe_context *pipe);
> +
> +struct pipe_context *
> +threaded_context_create(struct pipe_context *pipe,
> + struct slab_parent_pool *parent_transfer_pool,
> + tc_replace_buffer_storage_func replace_buffer,
> + struct threaded_context **out);
> +
> +static inline struct threaded_context *
> +threaded_context(struct pipe_context *pipe)
> +{
> + return (struct threaded_context*)pipe;
> +}
> +
> +static inline struct threaded_resource *
> +threaded_resource(struct pipe_resource *res)
> +{
> + return (struct threaded_resource*)res;
> +}
> +
> +static inline struct threaded_query *
> +threaded_query(struct pipe_query *q)
> +{
> + return (struct threaded_query*)q;
> +}
> +
> +static inline struct threaded_transfer *
> +threaded_transfer(struct pipe_transfer *transfer)
> +{
> + return (struct threaded_transfer*)transfer;
> +}
> +
> +#endif
> diff --git a/src/gallium/auxiliary/util/u_threaded_context_calls.h b/src/gallium/auxiliary/util/u_threaded_context_calls.h
> new file mode 100644
> index 0000000..7dfccb0
> --- /dev/null
> +++ b/src/gallium/auxiliary/util/u_threaded_context_calls.h
> @@ -0,0 +1,66 @@
> +CALL(destroy_query)
> +CALL(begin_query)
> +CALL(end_query)
> +CALL(get_query_result_resource)
> +CALL(render_condition)
> +CALL(bind_sampler_states)
> +CALL(set_framebuffer_state)
> +CALL(set_tess_state)
> +CALL(set_constant_buffer)
> +CALL(set_scissor_states)
> +CALL(set_viewport_states)
> +CALL(set_window_rectangles)
> +CALL(set_sampler_views)
> +CALL(set_shader_images)
> +CALL(set_shader_buffers)
> +CALL(set_vertex_buffers)
> +CALL(set_stream_output_targets)
> +CALL(replace_buffer_storage)
> +CALL(transfer_flush_region)
> +CALL(transfer_unmap)
> +CALL(buffer_subdata)
> +CALL(texture_subdata)
> +CALL(emit_string_marker)
> +CALL(draw_vbo)
> +CALL(launch_grid)
> +CALL(resource_copy_region)
> +CALL(blit)
> +CALL(generate_mipmap)
> +CALL(flush_resource)
> +CALL(invalidate_resource)
> +CALL(clear)
> +CALL(clear_buffer)
> +CALL(clear_texture)
> +CALL(resource_commit)
> +CALL(set_active_query_state)
> +CALL(set_blend_color)
> +CALL(set_stencil_ref)
> +CALL(set_clip_state)
> +CALL(set_sample_mask)
> +CALL(set_min_samples)
> +CALL(set_polygon_stipple)
> +CALL(texture_barrier)
> +CALL(memory_barrier)
> +
> +CALL(bind_blend_state)
> +CALL(bind_rasterizer_state)
> +CALL(bind_depth_stencil_alpha_state)
> +CALL(bind_compute_state)
> +CALL(bind_fs_state)
> +CALL(bind_vs_state)
> +CALL(bind_gs_state)
> +CALL(bind_tcs_state)
> +CALL(bind_tes_state)
> +CALL(bind_vertex_elements_state)
> +
> +CALL(delete_blend_state)
> +CALL(delete_rasterizer_state)
> +CALL(delete_depth_stencil_alpha_state)
> +CALL(delete_compute_state)
> +CALL(delete_fs_state)
> +CALL(delete_vs_state)
> +CALL(delete_gs_state)
> +CALL(delete_tcs_state)
> +CALL(delete_tes_state)
> +CALL(delete_vertex_elements_state)
> +CALL(delete_sampler_state)
>
--
Learn what the world really is,
but never forget what it ought to be.