Mesa (master): st/mesa: pin driver threads to a specific L3 cache on AMD Zen (v2)

Fri Sep 7 20:04:16 UTC 2018

Module: Mesa
Branch: master
Commit: 8d473f555a0c3c94cffd18e68a13274488dcfb17
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=8d473f555a0c3c94cffd18e68a13274488dcfb17

Author: Marek Olšák <marek.olsak at amd.com>
Date:   Wed Sep  5 23:10:57 2018 -0400

st/mesa: pin driver threads to a specific L3 cache on AMD Zen (v2)

v2: use set_context_param

Reviewed-by: Brian Paul <brianp at vmware.com>

---

 src/gallium/auxiliary/util/u_helpers.c | 42 +++++++++++++++++++++++++
 src/gallium/auxiliary/util/u_helpers.h |  4 +++
 src/mesa/state_tracker/st_context.c    |  3 ++
 src/mesa/state_tracker/st_manager.c    | 10 ++++++
 src/util/u_thread.h                    | 57 ++++++++++++++++++++++++++++++++++
 5 files changed, 116 insertions(+)

diff --git a/src/gallium/auxiliary/util/u_helpers.c b/src/gallium/auxiliary/util/u_helpers.c
index 365238631a..7d45b2f06d 100644
--- a/src/gallium/auxiliary/util/u_helpers.c
+++ b/src/gallium/auxiliary/util/u_helpers.c
@@ -25,9 +25,11 @@
  *
  **************************************************************************/
 
+#include "util/u_cpu_detect.h"
 #include "util/u_helpers.h"
 #include "util/u_inlines.h"
 #include "util/u_upload_mgr.h"
+#include "util/u_thread.h"
 #include <inttypes.h>
 
 /**
@@ -118,6 +120,46 @@ util_upload_index_buffer(struct pipe_context *pipe,
    return *out_buffer != NULL;
 }
 
+/**
+ * Called by MakeCurrent. Used to notify the driver that the application
+ * thread may have been changed.
+ *
+ * The function pins the current thread and driver threads to a group of
+ * CPU cores that share the same L3 cache. This is needed for good multi-
+ * threading performance on AMD Zen CPUs.
+ *
+ * \param upper_thread  thread in the state tracker that also needs to be
+ *                      pinned.
+ */
+void
+util_context_thread_changed(struct pipe_context *ctx, thrd_t *upper_thread)
+{
+   thrd_t current = thrd_current();
+   int cache = util_get_L3_for_pinned_thread(current,
+                                             util_cpu_caps.cores_per_L3);
+
+   /* If the main thread is not pinned, choose the L3 cache. */
+   if (cache == -1) {
+      unsigned num_caches = util_cpu_caps.nr_cpus /
+                            util_cpu_caps.cores_per_L3;
+      static unsigned last_cache;
+
+      /* Choose a different L3 cache for each subsequent MakeCurrent. */
+      cache = p_atomic_inc_return(&last_cache) % num_caches;
+      util_pin_thread_to_L3(current, cache, util_cpu_caps.cores_per_L3);
+   }
+
+   /* Tell the driver to pin its threads to the same L3 cache. */
+   if (ctx->set_context_param) {
+      ctx->set_context_param(ctx, PIPE_CONTEXT_PARAM_PIN_THREADS_TO_L3_CACHE,
+                             cache);
+   }
+
+   /* Do the same for the upper level thread if there is any (e.g. glthread) */
+   if (upper_thread)
+      util_pin_thread_to_L3(*upper_thread, cache, util_cpu_caps.cores_per_L3);
+}
+
 /* This is a helper for hardware bring-up. Don't remove. */
 struct pipe_query *
 util_begin_pipestat_query(struct pipe_context *ctx)
diff --git a/src/gallium/auxiliary/util/u_helpers.h b/src/gallium/auxiliary/util/u_helpers.h
index 1e57e32959..f00436e3b1 100644
--- a/src/gallium/auxiliary/util/u_helpers.h
+++ b/src/gallium/auxiliary/util/u_helpers.h
@@ -29,6 +29,7 @@
 #define U_HELPERS_H
 
 #include "pipe/p_state.h"
+#include "c11/threads.h"
 #include <stdio.h>
 
 #ifdef __cplusplus
@@ -50,6 +51,9 @@ bool util_upload_index_buffer(struct pipe_context *pipe,
                               struct pipe_resource **out_buffer,
                               unsigned *out_offset);
 
+void
+util_context_thread_changed(struct pipe_context *ctx, thrd_t *upper_thread);
+
 struct pipe_query *
 util_begin_pipestat_query(struct pipe_context *ctx);
 
diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c
index 34bfb845bb..146d9c629a 100644
--- a/src/mesa/state_tracker/st_context.c
+++ b/src/mesa/state_tracker/st_context.c
@@ -79,6 +79,7 @@
 #include "st_vdpau.h"
 #include "st_texture.h"
 #include "pipe/p_context.h"
+#include "util/u_cpu_detect.h"
 #include "util/u_inlines.h"
 #include "util/u_upload_mgr.h"
 #include "util/u_vbuf.h"
@@ -563,6 +564,8 @@ st_create_context(gl_api api, struct pipe_context *pipe,
    struct dd_function_table funcs;
    struct st_context *st;
 
+   util_cpu_detect();
+
    memset(&funcs, 0, sizeof(funcs));
    st_init_driver_functions(pipe->screen, &funcs);
 
diff --git a/src/mesa/state_tracker/st_manager.c b/src/mesa/state_tracker/st_manager.c
index 69286b5791..a3f0022894 100644
--- a/src/mesa/state_tracker/st_manager.c
+++ b/src/mesa/state_tracker/st_manager.c
@@ -56,6 +56,7 @@
 #include "pipe/p_context.h"
 #include "pipe/p_screen.h"
 #include "util/u_format.h"
+#include "util/u_helpers.h"
 #include "util/u_pointer.h"
 #include "util/u_inlines.h"
 #include "util/u_atomic.h"
@@ -1063,6 +1064,15 @@ st_api_make_current(struct st_api *stapi, struct st_context_iface *stctxi,
        * of the referenced drawables no longer exist.
        */
       st_framebuffers_purge(st);
+
+      /* Notify the driver that the context thread may have been changed.
+       * This should pin all driver threads to a specific L3 cache for optimal
+       * performance on AMD Zen CPUs.
+       */
+      struct glthread_state *glthread = st->ctx->GLThread;
+      thrd_t *upper_thread = glthread ? &glthread->queue.threads[0] : NULL;
+
+      util_context_thread_changed(st->pipe, upper_thread);
    }
    else {
       ret = _mesa_make_current(NULL, NULL, NULL);
diff --git a/src/util/u_thread.h b/src/util/u_thread.h
index 8c6e0bdc59..ec0d9a7f36 100644
--- a/src/util/u_thread.h
+++ b/src/util/u_thread.h
@@ -70,6 +70,63 @@ static inline void u_thread_setname( const char *name )
    (void)name;
 }
 
+/**
+ * An AMD Zen CPU consists of multiple modules where each module has its own L3
+ * cache. Inter-thread communication such as locks and atomics between modules
+ * is very expensive. It's desirable to pin a group of closely cooperating
+ * threads to one group of cores sharing L3.
+ *
+ * \param thread        thread
+ * \param L3_index      index of the L3 cache
+ * \param cores_per_L3  number of CPU cores shared by one L3
+ */
+static inline void
+util_pin_thread_to_L3(thrd_t thread, unsigned L3_index, unsigned cores_per_L3)
+{
+#if defined(HAVE_PTHREAD)
+   cpu_set_t cpuset;
+
+   CPU_ZERO(&cpuset);
+   for (unsigned i = 0; i < cores_per_L3; i++)
+      CPU_SET(L3_index * cores_per_L3 + i, &cpuset);
+   pthread_setaffinity_np(thread, sizeof(cpuset), &cpuset);
+#endif
+}
+
+/**
+ * Return the index of L3 that the thread is pinned to. If the thread is
+ * pinned to multiple L3 caches, return -1.
+ *
+ * \param thread        thread
+ * \param cores_per_L3  number of CPU cores shared by one L3
+ */
+static inline int
+util_get_L3_for_pinned_thread(thrd_t thread, unsigned cores_per_L3)
+{
+#if defined(HAVE_PTHREAD)
+   cpu_set_t cpuset;
+
+   if (pthread_getaffinity_np(thread, sizeof(cpuset), &cpuset) == 0) {
+      int L3_index = -1;
+
+      for (unsigned i = 0; i < CPU_SETSIZE; i++) {
+         if (CPU_ISSET(i, &cpuset)) {
+            int x = i / cores_per_L3;
+
+            if (L3_index != x) {
+               if (L3_index == -1)
+                  L3_index = x;
+               else
+                  return -1; /* multiple L3s are set */
+            }
+         }
+      }
+      return L3_index;
+   }
+#endif
+   return -1;
+}
+
 /*
  * Thread statistics.
  */