Mesa (main): util: add a util_bitcount variant that selects POPCNT through C++ template arg
GitLab Mirror
gitlab-mirror at kemper.freedesktop.org
Wed Nov 3 23:50:04 UTC 2021
Module: Mesa
Branch: main
Commit: 81d35c8d48508e1d28724755af28a6c7572516e8
URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=81d35c8d48508e1d28724755af28a6c7572516e8
Author: Marek Olšák <marek.olsak at amd.com>
Date: Sat Oct 23 23:23:15 2021 -0400
util: add a util_bitcount variant that selects POPCNT through C++ template arg
Moved from radeonsi. st/mesa will use it.
Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer at amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13512>
---
src/gallium/drivers/radeonsi/si_state_draw.cpp | 29 ++++++--------------------
src/util/bitscan.h | 24 ++++++++++++++++++++-
2 files changed, 29 insertions(+), 24 deletions(-)
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp
index 62efba1cc70..ecaa6cec016 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp
@@ -1728,41 +1728,24 @@ void si_set_vertex_buffer_descriptor(struct si_screen *sscreen, struct si_vertex
#endif
-/* util_bitcount has large measurable overhead (~2% difference in viewperf), so we use
- * the POPCNT x86 instruction via inline assembly if the CPU supports it.
- */
-enum si_has_popcnt {
- POPCNT_NO,
- POPCNT_YES,
-};
-
-template<si_has_popcnt POPCNT>
-unsigned bitcount_asm(unsigned n)
-{
- if (POPCNT == POPCNT_YES)
- return util_popcnt_inline_asm(n);
- else
- return util_bitcount(n);
-}
-
-template<si_has_popcnt POPCNT>
+template<util_popcnt POPCNT>
static ALWAYS_INLINE unsigned get_next_vertex_state_elem(struct pipe_vertex_state *state,
uint32_t *partial_velem_mask)
{
unsigned semantic_index = u_bit_scan(partial_velem_mask);
assert(state->input.full_velem_mask & BITFIELD_BIT(semantic_index));
/* A prefix mask of the full mask gives us the index in pipe_vertex_state. */
- return bitcount_asm<POPCNT>(state->input.full_velem_mask & BITFIELD_MASK(semantic_index));
+ return util_bitcount_fast<POPCNT>(state->input.full_velem_mask & BITFIELD_MASK(semantic_index));
}
template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG,
- si_is_draw_vertex_state IS_DRAW_VERTEX_STATE, si_has_popcnt POPCNT> ALWAYS_INLINE
+ si_is_draw_vertex_state IS_DRAW_VERTEX_STATE, util_popcnt POPCNT> ALWAYS_INLINE
static bool si_upload_and_prefetch_VB_descriptors(struct si_context *sctx,
struct pipe_vertex_state *state,
uint32_t partial_velem_mask)
{
struct si_vertex_state *vstate = (struct si_vertex_state *)state;
- unsigned count = IS_DRAW_VERTEX_STATE ? bitcount_asm<POPCNT>(partial_velem_mask) :
+ unsigned count = IS_DRAW_VERTEX_STATE ? util_bitcount_fast<POPCNT>(partial_velem_mask) :
sctx->num_vertex_elements;
unsigned sh_base = si_get_user_data_base(GFX_VERSION, HAS_TESS, HAS_GS, NGG,
PIPE_SHADER_VERTEX);
@@ -2031,7 +2014,7 @@ static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_i
} while (0)
template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG,
- si_is_draw_vertex_state IS_DRAW_VERTEX_STATE, si_has_popcnt POPCNT> ALWAYS_INLINE
+ si_is_draw_vertex_state IS_DRAW_VERTEX_STATE, util_popcnt POPCNT> ALWAYS_INLINE
static void si_draw(struct pipe_context *ctx,
const struct pipe_draw_info *info,
unsigned drawid_offset,
@@ -2501,7 +2484,7 @@ static void si_draw_vbo(struct pipe_context *ctx,
}
template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG,
- si_has_popcnt POPCNT>
+ util_popcnt POPCNT>
static void si_draw_vertex_state(struct pipe_context *ctx,
struct pipe_vertex_state *vstate,
uint32_t partial_velem_mask,
diff --git a/src/util/bitscan.h b/src/util/bitscan.h
index 105b7ba3122..82b1bb5a1dd 100644
--- a/src/util/bitscan.h
+++ b/src/util/bitscan.h
@@ -351,6 +351,28 @@ util_bitcount64(uint64_t n)
#ifdef __cplusplus
}
-#endif
+
+/* util_bitcount has large measurable overhead (~2%), so it's recommended to
+ * use the POPCNT instruction via inline assembly if the CPU supports it.
+ */
+enum util_popcnt {
+ POPCNT_NO,
+ POPCNT_YES,
+};
+
+/* Convenient function to select popcnt through a C++ template argument.
+ * This should be used as part of larger functions that are optimized
+ * as a whole.
+ */
+template<util_popcnt POPCNT> inline unsigned
+util_bitcount_fast(unsigned n)
+{
+ if (POPCNT == POPCNT_YES)
+ return util_popcnt_inline_asm(n);
+ else
+ return util_bitcount(n);
+}
+
+#endif /* __cplusplus */
#endif /* BITSCAN_H */
More information about the mesa-commit mailing list