Mesa (main): util: add a util_bitcount variant that selects POPCNT through C++ template arg

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Wed Nov 3 23:50:04 UTC 2021


Module: Mesa
Branch: main
Commit: 81d35c8d48508e1d28724755af28a6c7572516e8
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=81d35c8d48508e1d28724755af28a6c7572516e8

Author: Marek Olšák <marek.olsak at amd.com>
Date:   Sat Oct 23 23:23:15 2021 -0400

util: add a util_bitcount variant that selects POPCNT through C++ template arg

Moved from radeonsi. st/mesa will use it.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer at amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13512>

---

 src/gallium/drivers/radeonsi/si_state_draw.cpp | 29 ++++++--------------------
 src/util/bitscan.h                             | 24 ++++++++++++++++++++-
 2 files changed, 29 insertions(+), 24 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp
index 62efba1cc70..ecaa6cec016 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp
@@ -1728,41 +1728,24 @@ void si_set_vertex_buffer_descriptor(struct si_screen *sscreen, struct si_vertex
 
 #endif
 
-/* util_bitcount has large measurable overhead (~2% difference in viewperf),  so we use
- * the POPCNT x86 instruction via inline assembly if the CPU supports it.
- */
-enum si_has_popcnt {
-   POPCNT_NO,
-   POPCNT_YES,
-};
-
-template<si_has_popcnt POPCNT>
-unsigned bitcount_asm(unsigned n)
-{
-   if (POPCNT == POPCNT_YES)
-      return util_popcnt_inline_asm(n);
-   else
-      return util_bitcount(n);
-}
-
-template<si_has_popcnt POPCNT>
+template<util_popcnt POPCNT>
 static ALWAYS_INLINE unsigned get_next_vertex_state_elem(struct pipe_vertex_state *state,
                                                          uint32_t *partial_velem_mask)
 {
    unsigned semantic_index = u_bit_scan(partial_velem_mask);
    assert(state->input.full_velem_mask & BITFIELD_BIT(semantic_index));
    /* A prefix mask of the full mask gives us the index in pipe_vertex_state. */
-   return bitcount_asm<POPCNT>(state->input.full_velem_mask & BITFIELD_MASK(semantic_index));
+   return util_bitcount_fast<POPCNT>(state->input.full_velem_mask & BITFIELD_MASK(semantic_index));
 }
 
 template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG,
-          si_is_draw_vertex_state IS_DRAW_VERTEX_STATE, si_has_popcnt POPCNT> ALWAYS_INLINE
+          si_is_draw_vertex_state IS_DRAW_VERTEX_STATE, util_popcnt POPCNT> ALWAYS_INLINE
 static bool si_upload_and_prefetch_VB_descriptors(struct si_context *sctx,
                                                   struct pipe_vertex_state *state,
                                                   uint32_t partial_velem_mask)
 {
    struct si_vertex_state *vstate = (struct si_vertex_state *)state;
-   unsigned count = IS_DRAW_VERTEX_STATE ? bitcount_asm<POPCNT>(partial_velem_mask) :
+   unsigned count = IS_DRAW_VERTEX_STATE ? util_bitcount_fast<POPCNT>(partial_velem_mask) :
                                            sctx->num_vertex_elements;
    unsigned sh_base = si_get_user_data_base(GFX_VERSION, HAS_TESS, HAS_GS, NGG,
                                             PIPE_SHADER_VERTEX);
@@ -2031,7 +2014,7 @@ static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_i
    } while (0)
 
 template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG,
-          si_is_draw_vertex_state IS_DRAW_VERTEX_STATE, si_has_popcnt POPCNT> ALWAYS_INLINE
+          si_is_draw_vertex_state IS_DRAW_VERTEX_STATE, util_popcnt POPCNT> ALWAYS_INLINE
 static void si_draw(struct pipe_context *ctx,
                     const struct pipe_draw_info *info,
                     unsigned drawid_offset,
@@ -2501,7 +2484,7 @@ static void si_draw_vbo(struct pipe_context *ctx,
 }
 
 template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG,
-          si_has_popcnt POPCNT>
+          util_popcnt POPCNT>
 static void si_draw_vertex_state(struct pipe_context *ctx,
                                  struct pipe_vertex_state *vstate,
                                  uint32_t partial_velem_mask,
diff --git a/src/util/bitscan.h b/src/util/bitscan.h
index 105b7ba3122..82b1bb5a1dd 100644
--- a/src/util/bitscan.h
+++ b/src/util/bitscan.h
@@ -351,6 +351,28 @@ util_bitcount64(uint64_t n)
 
 #ifdef __cplusplus
 }
-#endif
+
+/* util_bitcount has large measurable overhead (~2% difference in viewperf), so it's
+ * recommended to use the POPCNT x86 instruction via inline assembly if the CPU supports it.
+ */
+enum util_popcnt {
+   POPCNT_NO,
+   POPCNT_YES,
+};
+
+/* Convenience function to select popcnt through a C++ template argument.
+ * This should be used as part of larger functions that are optimized
+ * as a whole.
+ */
+template<util_popcnt POPCNT> inline unsigned
+util_bitcount_fast(unsigned n)
+{
+   if (POPCNT == POPCNT_YES)
+      return util_popcnt_inline_asm(n);
+   else
+      return util_bitcount(n);
+}
+
+#endif /* __cplusplus */
 
 #endif /* BITSCAN_H */



More information about the mesa-commit mailing list