[Mesa-dev] [PATCH 1/2] gallium/u_threaded: align batches and call slots to 16 bytes

Marek Olšák maraeo at gmail.com
Thu Jun 1 18:09:50 UTC 2017


From: Marek Olšák <marek.olsak at amd.com>

Not sure whether this actually helps; the intent is better cache usage from 16-byte-aligned call slots.
---
 src/gallium/auxiliary/util/u_threaded_context.c | 11 +++++++++--
 src/gallium/auxiliary/util/u_threaded_context.h |  9 ++++++++-
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/src/gallium/auxiliary/util/u_threaded_context.c b/src/gallium/auxiliary/util/u_threaded_context.c
index 8ea7f8a..34206bf 100644
--- a/src/gallium/auxiliary/util/u_threaded_context.c
+++ b/src/gallium/auxiliary/util/u_threaded_context.c
@@ -2118,21 +2118,21 @@ tc_destroy(struct pipe_context *_pipe)
 
    if (tc->base.const_uploader &&
        tc->base.stream_uploader != tc->base.const_uploader)
       u_upload_destroy(tc->base.const_uploader);
 
    if (tc->base.stream_uploader)
       u_upload_destroy(tc->base.stream_uploader);
 
    slab_destroy_child(&tc->pool_transfers);
    pipe->destroy(pipe);
-   FREE(tc);
+   os_free_aligned(tc);
 }
 
 static const tc_execute execute_func[TC_NUM_CALLS] = {
 #define CALL(name) tc_call_##name,
 #include "u_threaded_context_calls.h"
 #undef CALL
 };
 
 /**
  * Wrap an existing pipe_context into a threaded_context.
@@ -2158,25 +2158,32 @@ threaded_context_create(struct pipe_context *pipe,
    STATIC_ASSERT(sizeof(struct tc_call) <= 16);
 
    if (!pipe)
       return NULL;
 
    util_cpu_detect();
 
    if (!debug_get_bool_option("GALLIUM_THREAD", util_cpu_caps.nr_cpus > 1))
       return pipe;
 
-   tc = CALLOC_STRUCT(threaded_context);
+   tc = os_malloc_aligned(sizeof(struct threaded_context), 16);
    if (!tc) {
       pipe->destroy(pipe);
       return NULL;
    }
+   memset(tc, 0, sizeof(*tc));
+
+   assert((uintptr_t)tc % 16 == 0);
+   STATIC_ASSERT(offsetof(struct threaded_context, batch_slots[0]) % 16 == 0);
+   STATIC_ASSERT(offsetof(struct threaded_context, batch_slots[0].call[0]) % 16 == 0);
+   STATIC_ASSERT(offsetof(struct threaded_context, batch_slots[0].call[1]) % 16 == 0);
+   STATIC_ASSERT(offsetof(struct threaded_context, batch_slots[1].call[0]) % 16 == 0);
 
    /* The driver context isn't wrapped, so set its "priv" to NULL. */
    pipe->priv = NULL;
 
    tc->pipe = pipe;
    tc->replace_buffer_storage = replace_buffer;
    tc->map_buffer_alignment =
       pipe->screen->get_param(pipe->screen, PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT);
    tc->base.priv = pipe; /* priv points to the wrapped driver context */
    tc->base.screen = pipe->screen;
diff --git a/src/gallium/auxiliary/util/u_threaded_context.h b/src/gallium/auxiliary/util/u_threaded_context.h
index f139230..5d2a10c 100644
--- a/src/gallium/auxiliary/util/u_threaded_context.h
+++ b/src/gallium/auxiliary/util/u_threaded_context.h
@@ -266,21 +266,28 @@ struct threaded_query {
  * Most calls will typecast this to the type they need, typically larger
  * than 8 bytes.
  */
 union tc_payload {
    struct pipe_query *query;
    struct pipe_resource *resource;
    struct pipe_transfer *transfer;
    uint64_t __use_8_bytes;
 };
 
-struct tc_call {
+#ifdef _MSC_VER
+#define ALIGN16 __declspec(align(16))
+#else
+#define ALIGN16 __attribute__((aligned(16)))
+#endif
+
+/* Each call slot should be aligned to its own size for optimal cache usage. */
+struct ALIGN16 tc_call {
    unsigned sentinel;
    ushort num_call_slots;
    ushort call_id;
    union tc_payload payload;
 };
 
 struct tc_batch {
    struct pipe_context *pipe;
    unsigned sentinel;
    unsigned num_total_call_slots;
-- 
2.7.4



More information about the mesa-dev mailing list