Mesa (main): st/mesa: use POPCNT in st_update_array if the CPU supports it

Wed Nov 3 23:50:04 UTC 2021

Module: Mesa
Branch: main
Commit: d24539b15256ac255ffb75f199ffda962b39bad5
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=d24539b15256ac255ffb75f199ffda962b39bad5

Author: Marek Olšák <marek.olsak at amd.com>
Date:   Sat Oct 23 23:25:00 2021 -0400

st/mesa: use POPCNT in st_update_array if the CPU supports it

The st_update_array overhead decreases from 8.28% to 7.67% for a viewperf
subtest.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer at amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13512>

---

 src/mesa/state_tracker/st_atom.c         | 16 +++++++--
 src/mesa/state_tracker/st_atom.h         |  4 +++
 src/mesa/state_tracker/st_atom_array.cpp | 57 +++++++++++++++++++++-----------
 src/mesa/state_tracker/st_context.c      |  4 +--
 4 files changed, 56 insertions(+), 25 deletions(-)

diff --git a/src/mesa/state_tracker/st_atom.c b/src/mesa/state_tracker/st_atom.c
index de9369e6e3a..df2c5895fe7 100644
--- a/src/mesa/state_tracker/st_atom.c
+++ b/src/mesa/state_tracker/st_atom.c
@@ -38,21 +38,31 @@
 #include "st_manager.h"
 #include "st_util.h"
 
+#include "util/u_cpu_detect.h"
+
 
 typedef void (*update_func_t)(struct st_context *st);
 
 /* The list state update functions. */
-static const update_func_t update_functions[] =
+static update_func_t update_functions[ST_NUM_ATOMS];
+
+static void
+init_atoms_once(void)
 {
-#define ST_STATE(FLAG, st_update) st_update,
+#define ST_STATE(FLAG, st_update) update_functions[FLAG##_INDEX] = st_update;
 #include "st_atom_list.h"
 #undef ST_STATE
-};
 
+   if (util_get_cpu_caps()->has_popcnt)
+      update_functions[ST_NEW_VERTEX_ARRAYS_INDEX] = st_update_array_with_popcnt;
+}
 
 void st_init_atoms( struct st_context *st )
 {
    STATIC_ASSERT(ARRAY_SIZE(update_functions) <= 64);
+
+   static once_flag flag = ONCE_FLAG_INIT;
+   call_once(&flag, init_atoms_once);
 }
 
 
diff --git a/src/mesa/state_tracker/st_atom.h b/src/mesa/state_tracker/st_atom.h
index 0c53a229a1a..df2d68318f1 100644
--- a/src/mesa/state_tracker/st_atom.h
+++ b/src/mesa/state_tracker/st_atom.h
@@ -79,6 +79,9 @@ st_setup_current_user(struct st_context *st,
                       struct cso_velems_state *velements,
                       struct pipe_vertex_buffer *vbuffer, unsigned *num_vbuffers);
 
+void
+st_update_array_with_popcnt(struct st_context *st);
+
 struct pipe_vertex_state *
 st_create_gallium_vertex_state(struct gl_context *ctx,
                                const struct gl_vertex_array_object *vao,
@@ -90,6 +93,7 @@ enum {
 #define ST_STATE(FLAG, st_update) FLAG##_INDEX,
 #include "st_atom_list.h"
 #undef ST_STATE
+   ST_NUM_ATOMS,
 };
 
 /* Define ST_NEW_xxx values as static const uint64_t values.
diff --git a/src/mesa/state_tracker/st_atom_array.cpp b/src/mesa/state_tracker/st_atom_array.cpp
index 3a8a991a7aa..59a38931a05 100644
--- a/src/mesa/state_tracker/st_atom_array.cpp
+++ b/src/mesa/state_tracker/st_atom_array.cpp
@@ -70,7 +70,7 @@ init_velement(struct pipe_vertex_element *velements,
 /* ALWAYS_INLINE helps the compiler realize that most of the parameters are
  * on the stack.
  */
-static void ALWAYS_INLINE
+template<util_popcnt POPCNT> static void ALWAYS_INLINE
 setup_arrays(struct st_context *st,
              const struct gl_vertex_array_object *vao,
              const GLbitfield dual_slot_inputs,
@@ -119,7 +119,7 @@ setup_arrays(struct st_context *st,
          init_velement(velements->velems, &attrib->Format, 0,
                        binding->InstanceDivisor, bufidx,
                        dual_slot_inputs & BITFIELD_BIT(attr),
-                       util_bitcount(inputs_read & BITFIELD_MASK(attr)));
+                       util_bitcount_fast<POPCNT>(inputs_read & BITFIELD_MASK(attr)));
       }
       return;
    }
@@ -161,11 +161,12 @@ setup_arrays(struct st_context *st,
          init_velement(velements->velems, &attrib->Format, off,
                        binding->InstanceDivisor, bufidx,
                        dual_slot_inputs & BITFIELD_BIT(attr),
-                       util_bitcount(inputs_read & BITFIELD_MASK(attr)));
+                       util_bitcount_fast<POPCNT>(inputs_read & BITFIELD_MASK(attr)));
       } while (attrmask);
    }
 }
 
+/* Only used by the select/feedback mode. */
 void
 st_setup_arrays(struct st_context *st,
                 const struct st_vertex_program *vp,
@@ -176,11 +177,11 @@ st_setup_arrays(struct st_context *st,
 {
    struct gl_context *ctx = st->ctx;
 
-   setup_arrays(st, ctx->Array._DrawVAO, vp->Base.Base.DualSlotInputs,
-                vp_variant->vert_attrib_mask,
-                _mesa_draw_nonzero_divisor_bits(ctx),
-                _mesa_draw_array_bits(ctx), _mesa_draw_user_array_bits(ctx),
-                velements, vbuffer, num_vbuffers, has_user_vertex_buffers);
+   setup_arrays<POPCNT_NO>(st, ctx->Array._DrawVAO, vp->Base.Base.DualSlotInputs,
+                           vp_variant->vert_attrib_mask,
+                           _mesa_draw_nonzero_divisor_bits(ctx),
+                           _mesa_draw_array_bits(ctx), _mesa_draw_user_array_bits(ctx),
+                           velements, vbuffer, num_vbuffers, has_user_vertex_buffers);
 }
 
 /* ALWAYS_INLINE helps the compiler realize that most of the parameters are
@@ -189,7 +190,7 @@ st_setup_arrays(struct st_context *st,
  * Return the index of the vertex buffer where current attribs have been
  * uploaded.
  */
-static void ALWAYS_INLINE
+template<util_popcnt POPCNT> static void ALWAYS_INLINE
 st_setup_current(struct st_context *st,
                  const struct st_vertex_program *vp,
                  const struct st_common_variant *vp_variant,
@@ -222,7 +223,7 @@ st_setup_current(struct st_context *st,
 
          init_velement(velements->velems, &attrib->Format, cursor - data,
                        0, bufidx, dual_slot_inputs & BITFIELD_BIT(attr),
-                       util_bitcount(inputs_read & BITFIELD_MASK(attr)));
+                       util_bitcount_fast<POPCNT>(inputs_read & BITFIELD_MASK(attr)));
 
          cursor += alignment;
       } while (curmask);
@@ -250,6 +251,7 @@ st_setup_current(struct st_context *st,
    }
 }
 
+/* Only used by the select/feedback mode. */
 void
 st_setup_current_user(struct st_context *st,
                       const struct st_vertex_program *vp,
@@ -281,8 +283,8 @@ st_setup_current_user(struct st_context *st,
    }
 }
 
-void
-st_update_array(struct st_context *st)
+template<util_popcnt POPCNT> inline void
+st_update_array_templ(struct st_context *st)
 {
    struct gl_context *ctx = st->ctx;
    /* vertex program validation must be done before this */
@@ -297,15 +299,17 @@ st_update_array(struct st_context *st)
 
    /* ST_NEW_VERTEX_ARRAYS alias ctx->DriverFlags.NewArray */
    /* Setup arrays */
-   setup_arrays(st, ctx->Array._DrawVAO, vp->Base.Base.DualSlotInputs,
-                vp_variant->vert_attrib_mask,
-                _mesa_draw_nonzero_divisor_bits(ctx),
-                _mesa_draw_array_bits(ctx), _mesa_draw_user_array_bits(ctx),
-                &velements, vbuffer, &num_vbuffers, &uses_user_vertex_buffers);
+   setup_arrays<POPCNT>(st, ctx->Array._DrawVAO, vp->Base.Base.DualSlotInputs,
+                        vp_variant->vert_attrib_mask,
+                        _mesa_draw_nonzero_divisor_bits(ctx),
+                        _mesa_draw_array_bits(ctx),
+                        _mesa_draw_user_array_bits(ctx), &velements, vbuffer,
+                        &num_vbuffers, &uses_user_vertex_buffers);
 
    /* _NEW_CURRENT_ATTRIB */
    /* Setup zero-stride attribs. */
-   st_setup_current(st, vp, vp_variant, &velements, vbuffer, &num_vbuffers);
+   st_setup_current<POPCNT>(st, vp, vp_variant, &velements, vbuffer,
+                            &num_vbuffers);
 
    velements.count = vp->num_inputs + vp_variant->key.passthrough_edgeflags;
 
@@ -323,6 +327,18 @@ st_update_array(struct st_context *st)
    st->last_num_vbuffers = num_vbuffers;
 }
 
+void
+st_update_array(struct st_context *st)
+{
+   st_update_array_templ<POPCNT_NO>(st);
+}
+
+void
+st_update_array_with_popcnt(struct st_context *st)
+{
+   st_update_array_templ<POPCNT_YES>(st);
+}
+
 struct pipe_vertex_state *
 st_create_gallium_vertex_state(struct gl_context *ctx,
                                const struct gl_vertex_array_object *vao,
@@ -337,8 +353,9 @@ st_create_gallium_vertex_state(struct gl_context *ctx,
    struct cso_velems_state velements;
    bool uses_user_vertex_buffers;
 
-   setup_arrays(st, vao, dual_slot_inputs, inputs_read, 0, inputs_read, 0,
-                &velements, vbuffer, &num_vbuffers, &uses_user_vertex_buffers);
+   setup_arrays<POPCNT_NO>(st, vao, dual_slot_inputs, inputs_read, 0,
+                           inputs_read, 0, &velements, vbuffer, &num_vbuffers,
+                           &uses_user_vertex_buffers);
 
    if (num_vbuffers != 1 || uses_user_vertex_buffers) {
       assert(!"this should never happen with display lists");
diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c
index aac1bd6ea0c..ab2cc9642f9 100644
--- a/src/mesa/state_tracker/st_context.c
+++ b/src/mesa/state_tracker/st_context.c
@@ -584,6 +584,8 @@ st_create_context_priv(struct gl_context *ctx, struct pipe_context *pipe,
    uint i;
    struct st_context *st = ST_CALLOC_STRUCT( st_context);
 
+   util_cpu_detect();
+
    st->options = *options;
 
    ctx->st = st;
@@ -842,8 +844,6 @@ st_create_context_priv(struct gl_context *ctx, struct pipe_context *pipe,
          !st->lower_ucp;
    st->shader_has_one_variant[MESA_SHADER_COMPUTE] = st->has_shareable_shaders;
 
-   util_cpu_detect();
-
    if (util_get_cpu_caps()->num_L3_caches == 1 ||
        !st->pipe->set_context_param)
       st->pin_thread_counter = ST_L3_PINNING_DISABLED;