[Mesa-dev] [PATCH] i965/cs: Setup push constant data for uniforms
Jordan Justen
jordan.l.justen at intel.com
Mon Aug 3 23:03:40 PDT 2015
I added this patch to the 7-patch 'i965/cs: gl_LocalInvocationID
support' series, so go review it there instead. :)
-Jordan
On 2015-07-31 10:44:13, Jordan Justen wrote:
> brw_upload_cs_push_constants was based on gen6_upload_push_constants.
>
> v2:
> * Add FINISHME comments about more efficient ways to push uniforms
>
> Signed-off-by: Jordan Justen <jordan.l.justen at intel.com>
> Cc: Ben Widawsky <ben at bwidawsk.net>
> ---
> Ben, Regarding your v1 feedback:
>
> * I looked into the other mechanisms for uploading uniform data once,
> rather than once per local workgroup thread. They look compelling,
> but for now I just added 'FINISHME' comments to document them.
>
> * I think that the MI_ATOMIC workaround is only needed if the new bdw
> "Indirect Payload Storage" is used.
>
> src/mesa/drivers/dri/i965/brw_context.h | 2 +-
> src/mesa/drivers/dri/i965/brw_cs.cpp | 131 ++++++++++++++++++++++++++-
> src/mesa/drivers/dri/i965/brw_defines.h | 6 ++
> src/mesa/drivers/dri/i965/brw_state.h | 1 +
> src/mesa/drivers/dri/i965/brw_state_upload.c | 2 +
> 5 files changed, 137 insertions(+), 5 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
> index cd43ac5..0bc497b 100644
> --- a/src/mesa/drivers/dri/i965/brw_context.h
> +++ b/src/mesa/drivers/dri/i965/brw_context.h
> @@ -1460,7 +1460,7 @@ struct brw_context
>
> int num_atoms[BRW_NUM_PIPELINES];
> const struct brw_tracked_state render_atoms[57];
> - const struct brw_tracked_state compute_atoms[3];
> + const struct brw_tracked_state compute_atoms[4];
>
> /* If (INTEL_DEBUG & DEBUG_BATCH) */
> struct {
> diff --git a/src/mesa/drivers/dri/i965/brw_cs.cpp b/src/mesa/drivers/dri/i965/brw_cs.cpp
> index 29ee75b..28eddfc 100644
> --- a/src/mesa/drivers/dri/i965/brw_cs.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_cs.cpp
> @@ -327,6 +327,10 @@ brw_upload_cs_state(struct brw_context *brw)
> prog_data->binding_table.size_bytes,
> 32, &stage_state->bind_bo_offset);
>
> + unsigned push_constant_data_size =
> + prog_data->nr_params * sizeof(gl_constant_value);
> + unsigned reg_aligned_constant_size = ALIGN(push_constant_data_size, 32);
> + unsigned push_constant_regs = reg_aligned_constant_size / 32;
> unsigned threads = get_cs_thread_count(cs_prog_data);
>
> uint32_t dwords = brw->gen < 8 ? 8 : 9;
> @@ -359,12 +363,41 @@ brw_upload_cs_state(struct brw_context *brw)
>
> OUT_BATCH(0);
> const uint32_t vfe_urb_allocation = brw->gen >= 8 ? 2 : 0;
> - OUT_BATCH(SET_FIELD(vfe_urb_allocation, MEDIA_VFE_STATE_URB_ALLOC));
> +
> + /* We are uploading duplicated copies of push constant uniforms for each
> + * thread. Although the local id data needs to vary per thread, the other
> + * uniform data is the same for every thread. Unfortunately this duplication
> + * is required for gen7. As of Haswell, this duplication can be avoided, but
> + * this older mechanism with duplicated data continues to work.
> + *
> + * FINISHME: As of Haswell, we could make use of the
> + * INTERFACE_DESCRIPTOR_DATA "Cross-Thread Constant Data Read Length" field
> + * to only store one copy of uniform data.
> + *
> + * FINISHME: Broadwell adds a new alternative "Indirect Payload Storage"
> + * which is described in the GPGPU_WALKER command and in the Broadwell PRM
> + * Volume 7: 3D Media GPGPU, under Media GPGPU Pipeline => Mode of
> + * Operations => GPGPU Mode => Indirect Payload Storage.
> + *
> + * Note: The constant data is built in brw_upload_cs_push_constants below.
> + */
> + const uint32_t vfe_curbe_allocation = push_constant_regs * threads;
> + OUT_BATCH(SET_FIELD(vfe_urb_allocation, MEDIA_VFE_STATE_URB_ALLOC) |
> + SET_FIELD(vfe_curbe_allocation, MEDIA_VFE_STATE_CURBE_ALLOC));
> OUT_BATCH(0);
> OUT_BATCH(0);
> OUT_BATCH(0);
> ADVANCE_BATCH();
>
> + if (reg_aligned_constant_size > 0) {
> + BEGIN_BATCH(4);
> + OUT_BATCH(MEDIA_CURBE_LOAD << 16 | (4 - 2));
> + OUT_BATCH(0);
> + OUT_BATCH(reg_aligned_constant_size * threads);
> + OUT_BATCH(stage_state->push_const_offset);
> + ADVANCE_BATCH();
> + }
> +
> /* BRW_NEW_SURFACES and BRW_NEW_*_CONSTBUF */
> memcpy(bind, stage_state->surf_offset,
> prog_data->binding_table.size_bytes);
> @@ -378,7 +411,7 @@ brw_upload_cs_state(struct brw_context *brw)
> desc[dw++] = 0;
> desc[dw++] = 0;
> desc[dw++] = stage_state->bind_bo_offset;
> - desc[dw++] = 0;
> + desc[dw++] = SET_FIELD(push_constant_regs, MEDIA_CURBE_READ_LENGTH);
> const uint32_t media_threads =
> brw->gen >= 8 ?
> SET_FIELD(threads, GEN8_MEDIA_GPGPU_THREAD_COUNT) :
> @@ -400,8 +433,98 @@ const struct brw_tracked_state brw_cs_state = {
> /* explicit initialisers aren't valid C++, comment
> * them for documentation purposes */
> /* .dirty = */{
> - /* .mesa = */ 0,
> - /* .brw = */ BRW_NEW_CS_PROG_DATA,
> + /* .mesa = */ _NEW_PROGRAM_CONSTANTS,
> + /* .brw = */ BRW_NEW_CS_PROG_DATA |
> + BRW_NEW_PUSH_CONSTANT_ALLOCATION,
> },
> /* .emit = */ brw_upload_cs_state
> };
> +
> +
> +/**
> + * Creates a region containing the push constants for the CS on gen7+.
> + *
> + * Push constants are constant values (such as GLSL uniforms) that are
> + * pre-loaded into a shader stage's register space at thread spawn time.
> + *
> + * For other stages, see brw_curbe.c:brw_upload_constant_buffer for the
> + * equivalent gen4/5 code and gen6_vs_state.c:gen6_upload_push_constants for
> + * gen6+.
> + */
> +static void
> +brw_upload_cs_push_constants(struct brw_context *brw,
> + const struct gl_program *prog,
> + const struct brw_cs_prog_data *cs_prog_data,
> + struct brw_stage_state *stage_state,
> + enum aub_state_struct_type type)
> +{
> + struct gl_context *ctx = &brw->ctx;
> + const struct brw_stage_prog_data *prog_data =
> + (brw_stage_prog_data*) cs_prog_data;
> +
> + /* Updates the ParameterValues[i] pointers for all parameters of the
> + * basic type of PROGRAM_STATE_VAR.
> + */
> + /* XXX: Should this happen somewhere before to get our state flag set? */
> + _mesa_load_state_parameters(ctx, prog->Parameters);
> +
> + if (prog_data->nr_params == 0) {
> + stage_state->push_const_size = 0;
> + } else {
> + gl_constant_value *param;
> + unsigned i, t;
> +
> + const unsigned push_constant_data_size =
> + prog_data->nr_params * sizeof(gl_constant_value);
> + const unsigned param_aligned_count = ALIGN(push_constant_data_size, 8);
> + const unsigned reg_aligned_size = 8 * param_aligned_count;
> +
> + unsigned threads = get_cs_thread_count(cs_prog_data);
> +
> + param = (gl_constant_value*)
> + brw_state_batch(brw, type,
> + reg_aligned_size * threads,
> + 32, &stage_state->push_const_offset);
> + assert(param);
> +
> + STATIC_ASSERT(sizeof(gl_constant_value) == sizeof(float));
> +
> + /* _NEW_PROGRAM_CONSTANTS */
> + for (t = 0; t < threads; t++) {
> + for (i = 0; i < prog_data->nr_params; i++) {
> + param[t * param_aligned_count + i] = *prog_data->param[i];
> + }
> + }
> +
> + stage_state->push_const_size = ALIGN(prog_data->nr_params, 8) / 8;
> + }
> +}
> +
> +
> +static void
> +gen7_upload_cs_push_constants(struct brw_context *brw)
> +{
> + struct brw_stage_state *stage_state = &brw->cs.base;
> +
> + /* BRW_NEW_COMPUTE_PROGRAM */
> + const struct brw_compute_program *cp =
> + (struct brw_compute_program *) brw->compute_program;
> +
> + if (cp) {
> + /* CACHE_NEW_CS_PROG */
> + struct brw_cs_prog_data *cs_prog_data = brw->cs.prog_data;
> +
> + brw_upload_cs_push_constants(brw, &cp->program.Base, cs_prog_data,
> + stage_state, AUB_TRACE_WM_CONSTANTS);
> + }
> +}
> +
> +
> +const struct brw_tracked_state gen7_cs_push_constants = {
> + /* .dirty = */{
> + /* .mesa = */ _NEW_PROGRAM_CONSTANTS,
> + /* .brw = */ BRW_NEW_COMPUTE_PROGRAM |
> + BRW_NEW_PUSH_CONSTANT_ALLOCATION,
> + },
> + /* .emit = */ gen7_upload_cs_push_constants,
> +};
> diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
> index f595366..fb72c3e 100644
> --- a/src/mesa/drivers/dri/i965/brw_defines.h
> +++ b/src/mesa/drivers/dri/i965/brw_defines.h
> @@ -2613,7 +2613,13 @@ enum brw_wm_barycentric_interp_mode {
> # define MEDIA_VFE_STATE_CURBE_ALLOC_SHIFT 0
> # define MEDIA_VFE_STATE_CURBE_ALLOC_MASK INTEL_MASK(15, 0)
>
> +#define MEDIA_CURBE_LOAD 0x7001
> #define MEDIA_INTERFACE_DESCRIPTOR_LOAD 0x7002
> +/* GEN7 DW4, GEN8+ DW5 */
> +# define MEDIA_CURBE_READ_LENGTH_SHIFT 16
> +# define MEDIA_CURBE_READ_LENGTH_MASK INTEL_MASK(31, 16)
> +# define MEDIA_CURBE_READ_OFFSET_SHIFT 0
> +# define MEDIA_CURBE_READ_OFFSET_MASK INTEL_MASK(15, 0)
> /* GEN7 DW5, GEN8+ DW6 */
> # define MEDIA_GPGPU_THREAD_COUNT_SHIFT 0
> # define MEDIA_GPGPU_THREAD_COUNT_MASK INTEL_MASK(7, 0)
> diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
> index 2eff1b5..4a84782 100644
> --- a/src/mesa/drivers/dri/i965/brw_state.h
> +++ b/src/mesa/drivers/dri/i965/brw_state.h
> @@ -95,6 +95,7 @@ extern const struct brw_tracked_state brw_indices;
> extern const struct brw_tracked_state brw_vertices;
> extern const struct brw_tracked_state brw_index_buffer;
> extern const struct brw_tracked_state brw_cs_state;
> +extern const struct brw_tracked_state gen7_cs_push_constants;
> extern const struct brw_tracked_state gen6_binding_table_pointers;
> extern const struct brw_tracked_state gen6_blend_state;
> extern const struct brw_tracked_state gen6_cc_state_pointers;
> diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c
> index 6096b49..80d44bb 100644
> --- a/src/mesa/drivers/dri/i965/brw_state_upload.c
> +++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
> @@ -253,6 +253,7 @@ static const struct brw_tracked_state *gen7_render_atoms[] =
> static const struct brw_tracked_state *gen7_compute_atoms[] =
> {
> &brw_state_base_address,
> + &gen7_cs_push_constants,
> &brw_cs_abo_surfaces,
> &brw_cs_state,
> };
> @@ -338,6 +339,7 @@ static const struct brw_tracked_state *gen8_render_atoms[] =
> static const struct brw_tracked_state *gen8_compute_atoms[] =
> {
> &gen8_state_base_address,
> + &gen7_cs_push_constants,
> &brw_cs_abo_surfaces,
> &brw_cs_state,
> };
> --
> 2.1.4
>
More information about the mesa-dev
mailing list