[Mesa-dev] [PATCH 18/20] i965: Program the push constants state using the gather table
Abdiel Janulgue
abdiel.janulgue at linux.intel.com
Wed Oct 28 00:09:35 PDT 2015
On 10/27/2015 03:18 PM, Francisco Jerez wrote:
> Francisco Jerez <currojerez at riseup.net> writes:
>
>> Abdiel Janulgue <abdiel.janulgue at linux.intel.com> writes:
>>
>>> Use the gather table generated from the uniform uploads and
>>> ir_binop_ubo_load to gather and pack the constants to the gather pool.
>>>
>>> Note that the 3DSTATE_CONSTANT_* packet now refers to the gather
>>> pool generated by the resource streamer instead of the constant buffer
>>> pointed to by an offset of the dynamic state base address.
>>>
>>> v2: Support GEN8 + non-trivial rebase.
>>>
>>> Signed-off-by: Abdiel Janulgue <abdiel.janulgue at linux.intel.com>
>>> ---
>>> src/mesa/drivers/dri/i965/brw_state.h | 2 +-
>>> src/mesa/drivers/dri/i965/gen6_gs_state.c | 2 +-
>>> src/mesa/drivers/dri/i965/gen6_vs_state.c | 2 +-
>>> src/mesa/drivers/dri/i965/gen6_wm_state.c | 2 +-
>>> src/mesa/drivers/dri/i965/gen7_vs_state.c | 84 +++++++++++++++++++++++++++----
>>> 5 files changed, 79 insertions(+), 13 deletions(-)
>>>
>>> diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
>>> index c7c7e0b..a1e6c73 100644
>>> --- a/src/mesa/drivers/dri/i965/brw_state.h
>>> +++ b/src/mesa/drivers/dri/i965/brw_state.h
>>> @@ -361,7 +361,7 @@ brw_upload_pull_constants(struct brw_context *brw,
>>> void
>>> gen7_upload_constant_state(struct brw_context *brw,
>>> const struct brw_stage_state *stage_state,
>>> - bool active, unsigned opcode);
>>> + bool active, unsigned opcode, unsigned gather_op);
>>>
>>> void gen7_rs_control(struct brw_context *brw, int enable);
>>>
>>> diff --git a/src/mesa/drivers/dri/i965/gen6_gs_state.c b/src/mesa/drivers/dri/i965/gen6_gs_state.c
>>> index eb4c586..79a899e 100644
>>> --- a/src/mesa/drivers/dri/i965/gen6_gs_state.c
>>> +++ b/src/mesa/drivers/dri/i965/gen6_gs_state.c
>>> @@ -48,7 +48,7 @@ gen6_upload_gs_push_constants(struct brw_context *brw)
>>> }
>>>
>>> if (brw->gen >= 7)
>>> - gen7_upload_constant_state(brw, stage_state, gp, _3DSTATE_CONSTANT_GS);
>>> + gen7_upload_constant_state(brw, stage_state, gp, _3DSTATE_CONSTANT_GS, _3DSTATE_GATHER_CONSTANT_GS);
>>> }
>>>
>>> const struct brw_tracked_state gen6_gs_push_constants = {
>>> diff --git a/src/mesa/drivers/dri/i965/gen6_vs_state.c b/src/mesa/drivers/dri/i965/gen6_vs_state.c
>>> index d43af5b..bb375b9 100644
>>> --- a/src/mesa/drivers/dri/i965/gen6_vs_state.c
>>> +++ b/src/mesa/drivers/dri/i965/gen6_vs_state.c
>>> @@ -174,7 +174,7 @@ gen6_upload_vs_push_constants(struct brw_context *brw)
>>> gen7_emit_vs_workaround_flush(brw);
>>>
>>> gen7_upload_constant_state(brw, stage_state, true /* active */,
>>> - _3DSTATE_CONSTANT_VS);
>>> + _3DSTATE_CONSTANT_VS, _3DSTATE_GATHER_CONSTANT_VS);
>>> }
>>> }
>>>
>>> diff --git a/src/mesa/drivers/dri/i965/gen6_wm_state.c b/src/mesa/drivers/dri/i965/gen6_wm_state.c
>>> index d1748ba..c7b37c6 100644
>>> --- a/src/mesa/drivers/dri/i965/gen6_wm_state.c
>>> +++ b/src/mesa/drivers/dri/i965/gen6_wm_state.c
>>> @@ -51,7 +51,7 @@ gen6_upload_wm_push_constants(struct brw_context *brw)
>>>
>>> if (brw->gen >= 7) {
>>> gen7_upload_constant_state(brw, &brw->wm.base, true,
>>> - _3DSTATE_CONSTANT_PS);
>>> + _3DSTATE_CONSTANT_PS, _3DSTATE_GATHER_CONSTANT_PS);
>>> }
>>> }
>>>
>>> diff --git a/src/mesa/drivers/dri/i965/gen7_vs_state.c b/src/mesa/drivers/dri/i965/gen7_vs_state.c
>>> index b7e4858..cf07658 100644
>>> --- a/src/mesa/drivers/dri/i965/gen7_vs_state.c
>>> +++ b/src/mesa/drivers/dri/i965/gen7_vs_state.c
>>> @@ -28,19 +28,70 @@
>>> #include "program/prog_parameter.h"
>>> #include "program/prog_statevars.h"
>>> #include "intel_batchbuffer.h"
>>> +#include "glsl/glsl_parser_extras.h"
>>>
>>> +static void
>>> +gen7_submit_gather_table(struct brw_context* brw,
>>> + const struct brw_stage_state *stage_state,
>>> + const struct brw_stage_prog_data *prog_data,
>>> + unsigned gather_opcode)
>>> +{
>>> + uint32_t gather_dwords = 3 + prog_data->nr_gather_table;
>>> +
>>> + /* Ordinary uniforms are assigned to the first constant buffer slot */
>>> + unsigned cb_valid = 1;
>>> + /* Assign subsequent constant buffer slots to UBOs if any */
>>> + cb_valid |= (prog_data->nr_ubo_params > 0) ?
>>> + (2 << (BRW_UBO_GATHER_INDEX_APPEND + prog_data->max_ubo_const_block)) - 1 : 0;
>>> +
>>> + assert(cb_valid < 0xffff);
>>> +
>>> + BEGIN_BATCH(gather_dwords);
>>> + OUT_BATCH(gather_opcode << 16 | (gather_dwords - 2));
>>> + OUT_BATCH(SET_FIELD(cb_valid, BRW_GATHER_BUFFER_VALID) |
>>> + SET_FIELD(BRW_UNIFORM_GATHER_INDEX_START / 16,
>>> + BRW_GATHER_BINDING_TABLE_BLOCK));
>>> + OUT_BATCH(stage_state->push_const_offset);
>>> + for (int i = 0; i < prog_data->nr_gather_table; i++) {
>>> + /* Which bo are we referring to? The uniform constant buffer or
>>> + * the UBO block?
>>> + */
>>> + bool is_uniform = prog_data->gather_table[i].reg == -1;
>>> + int cb_offset = is_uniform ? i :
>>> + (prog_data->gather_table[i].const_offset / 16);
>>> + int bt_offset = is_uniform ? 0 :
>>> + (prog_data->gather_table[i].const_block + BRW_UBO_GATHER_INDEX_APPEND);
>>> +
>>> + assert(cb_offset < 256);
>>> + assert(bt_offset < 16);
>>> +
>>> + OUT_BATCH(SET_FIELD(cb_offset, BRW_GATHER_CONST_BUFFER_OFFSET) |
>>> + SET_FIELD(prog_data->gather_table[i].channel_mask,
>>> + BRW_GATHER_CHANNEL_MASK) | bt_offset);
>>> + }
>>> + ADVANCE_BATCH();
>>
>> This is called in response to the _NEW_PROGRAM_CONSTANTS state flag,
>> e.g. when uniforms from the default block are updated using
>> glUniform*(), but not necessarily if, say, a new buffer object is bound
>> to a UBO binding point -- Or worse, if something (e.g. glBufferSubData)
>> somehow modifies a buffer object while it's already bound to the
>> pipeline as UBO. It doesn't look like this would notice that a buffer
>> has been modified and resend appropriate _3DSTATE_GATHER_CONSTANT_*S
>> commands so that the new buffer contents are re-gathered.
>>
>> I suspect that my "arb_shader_image_load_store/host-mem-barrier/Uniform
>> buffer/RaW" piglit test would almost reproduce the issue (it binds the
>> same buffer as UBO and shader image simultaneously, then writes to the
>> buffer using imageStore(), calls glMemoryBarrier(GL_UNIFORM_BARRIER_BIT)
>> so that subsequent uniform memory access is coherent with previous image
>> writes, and then reads back the same UBO from the shader) -- If it
>> weren't because it uses non-constant indexing of the uniform array
>> (which you don't promote to gather constants), and because it changes
>> the value of a uniform between the two draw calls so you might actually
>> end up here by luck. ;)
>>
> I've made a non-exhaustive list of things that can potentially write to
> buffer objects and you should definitely consider. You may have to
> throw away the gather pool if a buffer currently bound as UBO is
> modified using:
>
> - Usual buffer upload commands (glBufferSubData and friends).
> - Buffer object mapping (persistent and coherent mappings are likely
> to require special care).
> - Meta paths that render into a buffer object bound as PBO
> (glReadPixels, glGetTexImage).
> - Shader image stores and atomics.
> - Shader storage buffer objects.
> - Shader atomic counters.
> - Transform feedback.
> - Query objects.
>
> The gl_buffer_object::UsageHistory field may be useful for some of these
> to find out whether you need to invalidate and re-gather your constant
> cache -- E.g. glReadPixels, glGetTexImage and glBufferSubData could just
> check if the target BO has USAGE_UNIFORM_BUFFER set among its usage
> history flags and in that case invalidate the gather constant cache.
> That would be simple but most likely overly pessimistic because the
> UsageHistory bitset won't tell you whether the buffer is still bound as
> UBO -- If it's not you wouldn't necessarily need to throw away the
> contents of the gather constant cache.
>
> To handle images, SSBOs and persistent non-coherent maps you could
> probably just invalidate the whole gather pool whenever glMemoryBarrier
> is called with UNIFORM_BARRIER_BIT or CLIENT_MAPPED_BUFFER_BARRIER_BIT
> set.
>
> The rest may be more difficult, and in some cases (e.g. while a buffer
> bound as UBO is persistently *and* coherently mapped) you may have to
> re-gather all constants prior to every draw call -- I'd expect that to
> be even more expensive than using plain pull constants though, so we
> might be better off disabling UBO gather in such cases even if that
> means doing state-dependent recompiles.
>
Hi Francisco!
Looks exhaustive enough ;) Thanks for this! Wouldn't have noticed these
corner cases at all without your help. I'll try to address the
synchronisation issues in the next version!
-Abdiel
More information about the mesa-dev
mailing list