[Mesa-dev] [PATCH 6/8] st/mesa: pin driver threads to a specific L3 cache on AMD Zen CPUs
Marek Olšák
maraeo at gmail.com
Thu Sep 6 04:02:27 UTC 2018
From: Marek Olšák <marek.olsak at amd.com>
---
src/gallium/auxiliary/util/u_helpers.c | 40 ++++++++++++++++++
src/gallium/auxiliary/util/u_helpers.h | 4 ++
src/mesa/state_tracker/st_context.c | 3 ++
src/mesa/state_tracker/st_manager.c | 9 ++++
src/util/u_thread.h | 57 ++++++++++++++++++++++++++
5 files changed, 113 insertions(+)
diff --git a/src/gallium/auxiliary/util/u_helpers.c b/src/gallium/auxiliary/util/u_helpers.c
index 25d8fbce6f7..8374940fc41 100644
--- a/src/gallium/auxiliary/util/u_helpers.c
+++ b/src/gallium/auxiliary/util/u_helpers.c
@@ -18,23 +18,25 @@
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS AND/OR THEIR SUPPLIERS BE LIABLE FOR
* ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
**************************************************************************/
+#include "util/u_cpu_detect.h"
#include "util/u_helpers.h"
#include "util/u_inlines.h"
#include "util/u_upload_mgr.h"
+#include "util/u_thread.h"
#include <inttypes.h>
/**
* This function is used to copy an array of pipe_vertex_buffer structures,
* while properly referencing the pipe_vertex_buffer::buffer member.
*
* enabled_buffers is updated such that the bits corresponding to the indices
* of disabled buffers are set to 0 and the enabled ones are set to 1.
*
* \sa util_copy_framebuffer_state
@@ -111,20 +113,58 @@ util_upload_index_buffer(struct pipe_context *pipe,
u_upload_data(pipe->stream_uploader, start_offset,
info->count * info->index_size, 4,
(char*)info->index.user + start_offset,
out_offset, out_buffer);
u_upload_unmap(pipe->stream_uploader);
*out_offset -= start_offset;
return *out_buffer != NULL;
}
+/**
+ * Called by MakeCurrent. Used to notify the driver that the application
+ * thread may have changed.
+ *
+ * The function pins the current thread, the driver threads and the optional
+ * upper-level thread to a group of CPU cores that share the same L3 cache.
+ * This is needed for good multithreading performance on AMD Zen CPUs.
+ *
+ * If the current thread is already pinned to a single L3 cache, that cache
+ * is reused; otherwise a different L3 cache is chosen on each call.
+ *
+ * Nothing is pinned when util_cpu_caps describes less than one whole L3
+ * group (cores_per_L3 is 0 or larger than nr_cpus), because no L3 index
+ * can be chosen in that case.
+ *
+ * \param ctx           context whose optional pin_threads_to_L3_cache hook
+ *                      is invoked with the chosen L3 index
+ * \param upper_thread  thread in the state tracker that also needs to be
+ *                      pinned, or NULL if there is none
+ */
+void
+util_context_thread_changed(struct pipe_context *ctx, thrd_t *upper_thread)
+{
+   const unsigned cores_per_L3 = util_cpu_caps.cores_per_L3;
+   const unsigned num_caches =
+      cores_per_L3 ? util_cpu_caps.nr_cpus / cores_per_L3 : 0;
+
+   /* The integer division truncates to 0 when nr_cpus < cores_per_L3
+    * (e.g. a VM exposing only a few CPUs). Bail out instead of dividing
+    * by zero below.
+    */
+   if (num_caches == 0)
+      return;
+
+   thrd_t current = thrd_current();
+   int cache = util_get_L3_for_pinned_thread(current, cores_per_L3);
+
+   /* If the main thread is not pinned, choose the L3 cache. */
+   if (cache == -1) {
+      static unsigned last_cache;
+
+      /* Choose a different L3 cache for each subsequent MakeCurrent. */
+      cache = p_atomic_inc_return(&last_cache) % num_caches;
+      util_pin_thread_to_L3(current, cache, cores_per_L3);
+   }
+
+   /* Tell the driver to pin its threads to the same L3 cache. */
+   if (ctx->pin_threads_to_L3_cache)
+      ctx->pin_threads_to_L3_cache(ctx, cache);
+
+   /* Do the same for the upper level thread if there is any (e.g. glthread) */
+   if (upper_thread)
+      util_pin_thread_to_L3(*upper_thread, cache, cores_per_L3);
+}
+
/* This is a helper for hardware bring-up. Don't remove. */
struct pipe_query *
util_begin_pipestat_query(struct pipe_context *ctx)
{
struct pipe_query *q =
ctx->create_query(ctx, PIPE_QUERY_PIPELINE_STATISTICS, 0);
if (!q)
return NULL;
ctx->begin_query(ctx, q);
diff --git a/src/gallium/auxiliary/util/u_helpers.h b/src/gallium/auxiliary/util/u_helpers.h
index e65e64d7781..38c47c1cc98 100644
--- a/src/gallium/auxiliary/util/u_helpers.h
+++ b/src/gallium/auxiliary/util/u_helpers.h
@@ -22,20 +22,21 @@
* ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
**************************************************************************/
#ifndef U_HELPERS_H
#define U_HELPERS_H
#include "pipe/p_state.h"
+#include "c11/threads.h"
#include <stdio.h>
#ifdef __cplusplus
extern "C" {
#endif
void util_set_vertex_buffers_mask(struct pipe_vertex_buffer *dst,
uint32_t *enabled_buffers,
const struct pipe_vertex_buffer *src,
unsigned start_slot, unsigned count);
@@ -43,20 +44,23 @@ void util_set_vertex_buffers_mask(struct pipe_vertex_buffer *dst,
void util_set_vertex_buffers_count(struct pipe_vertex_buffer *dst,
unsigned *dst_count,
const struct pipe_vertex_buffer *src,
unsigned start_slot, unsigned count);
bool util_upload_index_buffer(struct pipe_context *pipe,
const struct pipe_draw_info *info,
struct pipe_resource **out_buffer,
unsigned *out_offset);
+void
+util_context_thread_changed(struct pipe_context *ctx, thrd_t *upper_thread);
+
struct pipe_query *
util_begin_pipestat_query(struct pipe_context *ctx);
void
util_end_pipestat_query(struct pipe_context *ctx, struct pipe_query *q,
FILE *f);
void
util_wait_for_idle(struct pipe_context *ctx);
diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c
index edcbd36a1bf..354876746f4 100644
--- a/src/mesa/state_tracker/st_context.c
+++ b/src/mesa/state_tracker/st_context.c
@@ -72,20 +72,21 @@
#include "st_draw.h"
#include "st_extensions.h"
#include "st_gen_mipmap.h"
#include "st_pbo.h"
#include "st_program.h"
#include "st_sampler_view.h"
#include "st_shader_cache.h"
#include "st_vdpau.h"
#include "st_texture.h"
#include "pipe/p_context.h"
+#include "util/u_cpu_detect.h"
#include "util/u_inlines.h"
#include "util/u_upload_mgr.h"
#include "util/u_vbuf.h"
#include "cso_cache/cso_context.h"
DEBUG_GET_ONCE_BOOL_OPTION(mesa_mvp_dp4, "MESA_MVP_DP4", FALSE)
/**
@@ -561,20 +562,22 @@ st_create_context(gl_api api, struct pipe_context *pipe,
const struct gl_config *visual,
struct st_context *share,
const struct st_config_options *options,
bool no_error)
{
struct gl_context *ctx;
struct gl_context *shareCtx = share ? share->ctx : NULL;
struct dd_function_table funcs;
struct st_context *st;
+ util_cpu_detect();
+
memset(&funcs, 0, sizeof(funcs));
st_init_driver_functions(pipe->screen, &funcs);
ctx = calloc(1, sizeof(struct gl_context));
if (!ctx)
return NULL;
if (!_mesa_initialize_context(ctx, api, visual, shareCtx, &funcs)) {
free(ctx);
return NULL;
diff --git a/src/mesa/state_tracker/st_manager.c b/src/mesa/state_tracker/st_manager.c
index 69286b57916..7a37f9850f8 100644
--- a/src/mesa/state_tracker/st_manager.c
+++ b/src/mesa/state_tracker/st_manager.c
@@ -1056,20 +1056,29 @@ st_api_make_current(struct st_api *stapi, struct st_context_iface *stctxi,
ret = _mesa_make_current(st->ctx, incomplete, incomplete);
}
st_framebuffer_reference(&stdraw, NULL);
st_framebuffer_reference(&stread, NULL);
/* Purge the context's winsys_buffers list in case any
* of the referenced drawables no longer exist.
*/
st_framebuffers_purge(st);
+
+ /* Notify the driver that the context thread may have been changed.
+ * This should pin all driver threads to a specific L3 cache for optimal
+ * performance on AMD Zen CPUs.
+ */
+ struct glthread_state *glthread = st->ctx->GLThread;
+ thrd_t *upper_thread = glthread ? &glthread->queue.threads[0] : NULL;
+
+ util_context_thread_changed(st->pipe, upper_thread);
}
else {
ret = _mesa_make_current(NULL, NULL, NULL);
}
return ret;
}
static void
diff --git a/src/util/u_thread.h b/src/util/u_thread.h
index 8c6e0bdc59e..0555ba61111 100644
--- a/src/util/u_thread.h
+++ b/src/util/u_thread.h
@@ -63,20 +63,77 @@ static inline void u_thread_setname( const char *name )
#if defined(HAVE_PTHREAD)
# if defined(__GNU_LIBRARY__) && defined(__GLIBC__) && defined(__GLIBC_MINOR__) && \
(__GLIBC__ >= 3 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 12)) && \
defined(__linux__)
pthread_setname_np(pthread_self(), name);
# endif
#endif
(void)name;
}
+/**
+ * Restrict a thread to the CPU cores that belong to one L3 cache.
+ *
+ * An AMD Zen CPU consists of multiple modules where each module has its own
+ * L3 cache, and inter-thread communication (locks, atomics) between modules
+ * is very expensive. Keeping a group of closely cooperating threads on the
+ * cores of a single L3 avoids that cost.
+ *
+ * The affinity mask is built from cores_per_L3 consecutive CPU indices
+ * starting at L3_index * cores_per_L3. The result of
+ * pthread_setaffinity_np is not checked, and the function does nothing
+ * when HAVE_PTHREAD is not defined.
+ *
+ * \param thread        thread whose affinity is changed
+ * \param L3_index      index of the L3 cache
+ * \param cores_per_L3  number of CPU cores shared by one L3
+ */
+static inline void
+util_pin_thread_to_L3(thrd_t thread, unsigned L3_index, unsigned cores_per_L3)
+{
+#if defined(HAVE_PTHREAD)
+   const unsigned first_cpu = L3_index * cores_per_L3;
+   unsigned offset = 0;
+   cpu_set_t mask;
+
+   CPU_ZERO(&mask);
+   while (offset < cores_per_L3) {
+      CPU_SET(first_cpu + offset, &mask);
+      offset++;
+   }
+
+   pthread_setaffinity_np(thread, sizeof(mask), &mask);
+#endif
+}
+
+/**
+ * Return the index of the L3 cache that the thread is pinned to.
+ *
+ * Returns -1 if the thread's affinity mask covers CPUs of more than one L3
+ * cache, if the mask has no CPU set, if cores_per_L3 is 0, if the affinity
+ * cannot be queried, or if HAVE_PTHREAD is not defined.
+ *
+ * \param thread        thread to query
+ * \param cores_per_L3  number of CPU cores shared by one L3
+ */
+static inline int
+util_get_L3_for_pinned_thread(thrd_t thread, unsigned cores_per_L3)
+{
+#if defined(HAVE_PTHREAD)
+   cpu_set_t cpuset;
+
+   /* CPU indices are divided by cores_per_L3 below; a group size of zero
+    * has no meaningful L3 index, so report "not pinned" instead of
+    * dividing by zero.
+    */
+   if (cores_per_L3 == 0)
+      return -1;
+
+   if (pthread_getaffinity_np(thread, sizeof(cpuset), &cpuset) == 0) {
+      int L3_index = -1;
+
+      for (unsigned i = 0; i < CPU_SETSIZE; i++) {
+         if (CPU_ISSET(i, &cpuset)) {
+            int x = i / cores_per_L3;
+
+            if (L3_index != x) {
+               if (L3_index == -1)
+                  L3_index = x;
+               else
+                  return -1; /* multiple L3s are set */
+            }
+         }
+      }
+      return L3_index;
+   }
+#endif
+   return -1;
+}
+
/*
* Thread statistics.
*/
/* Return the time of a thread's CPU time clock. */
static inline int64_t
u_thread_get_time_nano(thrd_t thread)
{
#if defined(__linux__) && defined(HAVE_PTHREAD)
struct timespec ts;
--
2.17.1
More information about the mesa-dev
mailing list