[Mesa-dev] [PATCH 4/4] r600g: use a bitfield to track dirty atoms
Grazvydas Ignotas
notasas at gmail.com
Mon Aug 10 17:20:00 PDT 2015
On Mon, Aug 10, 2015 at 1:47 PM, Marek Olšák <maraeo at gmail.com> wrote:
> Please never use "long" in Mesa. It only has 32 bits on 32-bit
> systems. uint64_t is generally used for all unsigned 64-bit variables
> and "llu" or "ull" is the number suffix. Also, the 64-bit ctz is ctzll
> and a proper HAVE macro should be added for it too.
Well I intentionally chose long to have a machine-word sized type, for
uint64_t gcc would have to emit multiple instructions for bit setting
and ctzll on 32bit CPUs (actually a library call for ctzll from what I
see), so the idea was to use longer array there instead.
> The general idea is nice, thanks.
>
> The number of atoms can be cut down by merging all scissors states
> into 1 atom (just as there is 1 atom for 16 textures, there can be 1
> atom for 16 scissors) and the same applies to viewport states. This
> would simplify the code, because all dirty bits would fit into 64 bits
> and there would even be some space left.
This sounds good, I'll try to work on it when I can find time.
Also, do you know why the atoms start from 4 for r600, and 0-3 seem to
be unused?
(also CC'ing Jerome, seems to be his code)
Gražvydas
>
> Patches 1-3:
> Reviewed-by: Marek Olšák <marek.olsak at amd.com>
>
> Marek
>
> On Sun, Aug 9, 2015 at 11:42 PM, Grazvydas Ignotas <notasas at gmail.com> wrote:
>> r600 currently has 73 atoms and looping through their dirty flags has
>> become costly because checking each flag requires a pointer
>> dereference before the read. To avoid having to do that add additional
>> bitfield which can be checked really quickly thanks to tzcnt instruction.
>>
>> id field was added to struct r600_atom but that doesn't affect memory
>> usage for both 32 and 64 bit CPUs because it was stuffed into padding.
>>
>> The performance improvement is ~2% for benchmarks that can have FPS in
>> the thousands but is hardly measurable in "real" programs.
>> ---
>> src/gallium/drivers/r600/r600_hw_context.c | 12 +++----
>> src/gallium/drivers/r600/r600_pipe.h | 45 +++++++++++++++++++++++++++
>> src/gallium/drivers/r600/r600_state_common.c | 8 ++---
>> src/gallium/drivers/radeon/r600_pipe_common.h | 1 +
>> 4 files changed, 56 insertions(+), 10 deletions(-)
>>
>> diff --git a/src/gallium/drivers/r600/r600_hw_context.c b/src/gallium/drivers/r600/r600_hw_context.c
>> index c048a71..6445151 100644
>> --- a/src/gallium/drivers/r600/r600_hw_context.c
>> +++ b/src/gallium/drivers/r600/r600_hw_context.c
>> @@ -51,13 +51,13 @@ void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw,
>> unsigned i;
>>
>> /* The number of dwords all the dirty states would take. */
>> - for (i = 0; i < R600_NUM_ATOMS; i++) {
>> - if (ctx->atoms[i] && ctx->atoms[i]->dirty) {
>> - num_dw += ctx->atoms[i]->num_dw;
>> - if (ctx->screen->b.trace_bo) {
>> - num_dw += R600_TRACE_CS_DWORDS;
>> - }
>> + i = r600_next_dirty_atom(ctx, 0);
>> + while (i < R600_NUM_ATOMS) {
>> + num_dw += ctx->atoms[i]->num_dw;
>> + if (ctx->screen->b.trace_bo) {
>> + num_dw += R600_TRACE_CS_DWORDS;
>> }
>> + i = r600_next_dirty_atom(ctx, i + 1);
>> }
>>
>> /* The upper-bound of how much space a draw command would take. */
>> diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h
>> index 8b61269..5d10bb4 100644
>> --- a/src/gallium/drivers/r600/r600_pipe.h
>> +++ b/src/gallium/drivers/r600/r600_pipe.h
>> @@ -85,6 +85,9 @@
>> #define R600_BIG_ENDIAN 0
>> #endif
>>
>> +#define R600_DIRTY_ATOM_WORD_BITS (sizeof(unsigned long) * 8)
>> +#define R600_DIRTY_ATOM_ARRAY_LEN DIV_ROUND_UP(R600_NUM_ATOMS, R600_DIRTY_ATOM_WORD_BITS)
>> +
>> struct r600_context;
>> struct r600_bytecode;
>> struct r600_shader_key;
>> @@ -426,6 +429,8 @@ struct r600_context {
>>
>> /* State binding slots are here. */
>> struct r600_atom *atoms[R600_NUM_ATOMS];
>> + /* Dirty atom bitmask for fast tests */
>> + unsigned long dirty_atoms[R600_DIRTY_ATOM_ARRAY_LEN];
>> /* States for CS initialization. */
>> struct r600_command_buffer start_cs_cmd; /* invariant state mostly */
>> /** Compute specific registers initializations. The start_cs_cmd atom
>> @@ -502,7 +507,18 @@ static inline void r600_set_atom_dirty(struct r600_context *rctx,
>> struct r600_atom *atom,
>> bool dirty)
>> {
>> + unsigned long mask;
>> + unsigned int w;
>> +
>> atom->dirty = dirty;
>> +
>> + assert(atom->id != 0);
>> + w = atom->id / R600_DIRTY_ATOM_WORD_BITS;
>> + mask = 1ul << (atom->id % R600_DIRTY_ATOM_WORD_BITS);
>> + if (dirty)
>> + rctx->dirty_atoms[w] |= mask;
>> + else
>> + rctx->dirty_atoms[w] &= ~mask;
>> }
>>
>> static inline void r600_mark_atom_dirty(struct r600_context *rctx,
>> @@ -511,6 +527,35 @@ static inline void r600_mark_atom_dirty(struct r600_context *rctx,
>> r600_set_atom_dirty(rctx, atom, true);
>> }
>>
>> +static inline unsigned int r600_next_dirty_atom(struct r600_context *rctx,
>> + unsigned int id)
>> +{
>> +#if !defined(DEBUG) && defined(HAVE___BUILTIN_CTZ)
>> + unsigned int w = id / R600_DIRTY_ATOM_WORD_BITS;
>> + unsigned int bit = id % R600_DIRTY_ATOM_WORD_BITS;
>> + unsigned long bits, mask = (1ul << bit) - 1;
>> +
>> + for (; w < R600_DIRTY_ATOM_ARRAY_LEN; w++, mask = 0ul) {
>> + bits = rctx->dirty_atoms[w] & ~mask;
>> + if (bits == 0)
>> + continue;
>> + return w * R600_DIRTY_ATOM_WORD_BITS + __builtin_ctzl(bits);
>> + }
>> +
>> + return R600_NUM_ATOMS;
>> +#else
>> + for (; id < R600_NUM_ATOMS; id++) {
>> + bool dirty = !!(rctx->dirty_atoms[id / R600_DIRTY_ATOM_WORD_BITS] &
>> + (1ul << (id % R600_DIRTY_ATOM_WORD_BITS)));
>> + assert(dirty == (rctx->atoms[id] && rctx->atoms[id]->dirty));
>> + if (dirty)
>> + break;
>> + }
>> +
>> + return id;
>> +#endif
>> +}
>> +
>> void r600_trace_emit(struct r600_context *rctx);
>>
>> static inline void r600_emit_atom(struct r600_context *rctx, struct r600_atom *atom)
>> diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c
>> index a4238a2..8d0942f 100644
>> --- a/src/gallium/drivers/r600/r600_state_common.c
>> +++ b/src/gallium/drivers/r600/r600_state_common.c
>> @@ -54,6 +54,7 @@ void r600_add_atom(struct r600_context *rctx,
>> assert(id < R600_NUM_ATOMS);
>> assert(rctx->atoms[id] == NULL);
>> rctx->atoms[id] = atom;
>> + atom->id = id;
>> atom->dirty = false;
>> }
>>
>> @@ -1476,11 +1477,10 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
>> r600_need_cs_space(rctx, ib.user_buffer ? 5 : 0, TRUE);
>> r600_flush_emit(rctx);
>>
>> - for (i = 0; i < R600_NUM_ATOMS; i++) {
>> - if (rctx->atoms[i] == NULL || !rctx->atoms[i]->dirty) {
>> - continue;
>> - }
>> + i = r600_next_dirty_atom(rctx, 0);
>> + while (i < R600_NUM_ATOMS) {
>> r600_emit_atom(rctx, rctx->atoms[i]);
>> + i = r600_next_dirty_atom(rctx, i + 1);
>> }
>>
>> if (rctx->b.chip_class == CAYMAN) {
>> diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
>> index 717e248..17da41e 100644
>> --- a/src/gallium/drivers/radeon/r600_pipe_common.h
>> +++ b/src/gallium/drivers/radeon/r600_pipe_common.h
>> @@ -314,6 +314,7 @@ struct r600_common_screen {
>> struct r600_atom {
>> void (*emit)(struct r600_common_context *ctx, struct r600_atom *state);
>> unsigned num_dw;
>> + unsigned short id; /* used by r600 only */
>> bool dirty;
>> };
>>
>> --
>> 1.9.1
>>
>> _______________________________________________
>> mesa-dev mailing list
>> mesa-dev at lists.freedesktop.org
>> http://lists.freedesktop.org/mailman/listinfo/mesa-dev
More information about the mesa-dev
mailing list