[Mesa-dev] [PATCH 1/2] i965/cs: Setup push constant data for uniforms

Ben Widawsky ben at bwidawsk.net
Thu Jul 9 17:15:14 PDT 2015


On Tue, Jun 16, 2015 at 02:21:39PM -0700, Jordan Justen wrote:
> brw_upload_cs_push_constants was based on gen6_upload_push_constants.

This review is based off of 2838833bfd5eb0a87fdacfa1cd6391b50f9c0b8b in your
repository. This patch doesn't apply cleanly in its current form.

> 
> Signed-off-by: Jordan Justen <jordan.l.justen at intel.com>
> ---
>  These 2 patches allow this piglit to pass:
>  tests/spec/arb_compute_shader/execution/basic-uniform-access-atomic.shader_test
>  (Also requires overriding the GL version and some extensions...)
> 
>  src/mesa/drivers/dri/i965/brw_context.h      |   2 +-
>  src/mesa/drivers/dri/i965/brw_cs.cpp         | 119 ++++++++++++++++++++++++++-
>  src/mesa/drivers/dri/i965/brw_defines.h      |   6 ++
>  src/mesa/drivers/dri/i965/brw_state.h        |   1 +
>  src/mesa/drivers/dri/i965/brw_state_upload.c |   2 +
>  5 files changed, 125 insertions(+), 5 deletions(-)
> 
> diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
> index 01c4283..9ea0dfd 100644
> --- a/src/mesa/drivers/dri/i965/brw_context.h
> +++ b/src/mesa/drivers/dri/i965/brw_context.h
> @@ -1457,7 +1457,7 @@ struct brw_context
>  
>     int num_atoms[BRW_NUM_PIPELINES];
>     const struct brw_tracked_state render_atoms[57];
> -   const struct brw_tracked_state compute_atoms[3];
> +   const struct brw_tracked_state compute_atoms[4];
>  
>     /* If (INTEL_DEBUG & DEBUG_BATCH) */
>     struct {
> diff --git a/src/mesa/drivers/dri/i965/brw_cs.cpp b/src/mesa/drivers/dri/i965/brw_cs.cpp
> index 44c76ba..e26d576 100644
> --- a/src/mesa/drivers/dri/i965/brw_cs.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_cs.cpp
> @@ -320,6 +320,9 @@ brw_upload_cs_state(struct brw_context *brw)
>                                              prog_data->binding_table.size_bytes,
>                                              32, &stage_state->bind_bo_offset);
>  
> +   unsigned push_constant_size =
> +      prog_data->nr_params * sizeof(gl_constant_value);
> +   unsigned reg_aligned_constant_size = ALIGN(push_constant_size, 32);

It seems 64b alignment is a requirement on BDW for the MEDIA_CURBE_LOAD.

>     unsigned threads = get_cs_thread_count(cs_prog_data);
>  

Also, you either need to apply the workaround required (MI_ATOMIC), or stick in
an assert and deal with the workaround later.
/* FINISHME: WaSomeNameFillMeIn */
if (brw->gen >= 8)
	assert (reg_aligned_constant_size * threads <= 4032);

>     uint32_t dwords = brw->gen < 8 ? 8 : 9;
> @@ -352,12 +355,24 @@ brw_upload_cs_state(struct brw_context *brw)
>  
>     OUT_BATCH(0);
>     const uint32_t vfe_urb_allocation = brw->gen >= 8 ? 2 : 0;
> -   OUT_BATCH(SET_FIELD(vfe_urb_allocation, MEDIA_VFE_STATE_URB_ALLOC));
> +   const uint32_t vfe_curbe_allocation =
> +      (reg_aligned_constant_size / 32) * threads + 32;

I can't make sense out of the additional 32 at the end. What is that for? If
it's the descriptor entries, this looks wrong for HSW (and beyond).

It seems like from the Indirect Payload Storage section of the docs (which is
specific to GEN8+, but has the legacy way), there is a way to specify cross
thread data. Maybe you can look into this some more so I don't have to.

> +   OUT_BATCH(SET_FIELD(vfe_urb_allocation, MEDIA_VFE_STATE_URB_ALLOC) |
> +             SET_FIELD(vfe_curbe_allocation, MEDIA_VFE_STATE_CURBE_ALLOC));
>     OUT_BATCH(0);
>     OUT_BATCH(0);
>     OUT_BATCH(0);
>     ADVANCE_BATCH();
>  
> +   if (reg_aligned_constant_size > 0) {
> +      BEGIN_BATCH(4);
> +      OUT_BATCH(MEDIA_CURBE_LOAD << 16 | (4 - 2));
> +      OUT_BATCH(0);
> +      OUT_BATCH(reg_aligned_constant_size * threads);
> +      OUT_BATCH(stage_state->push_const_offset);
> +      ADVANCE_BATCH();
> +   }
> +
>     /* BRW_NEW_SURFACES and BRW_NEW_*_CONSTBUF */
>     memcpy(bind, stage_state->surf_offset,
>            prog_data->binding_table.size_bytes);
> @@ -371,7 +386,8 @@ brw_upload_cs_state(struct brw_context *brw)
>     desc[dw++] = 0;
>     desc[dw++] = 0;
>     desc[dw++] = stage_state->bind_bo_offset;
> -   desc[dw++] = 0;
> +   desc[dw++] = SET_FIELD((reg_aligned_constant_size / 32) + 0,
> +                          MEDIA_CURBE_READ_LENGTH);
>     const uint32_t media_threads =
>        brw->gen >= 8 ?
>        SET_FIELD(threads, GEN8_MEDIA_GPGPU_THREAD_COUNT) :
> @@ -392,8 +408,103 @@ const struct brw_tracked_state brw_cs_state = {
>     /* explicit initialisers aren't valid C++, comment
>      * them for documentation purposes */
>     /* .dirty = */{
> -      /* .mesa = */ 0,
> -      /* .brw = */  BRW_NEW_CS_PROG_DATA,
> +      /* .mesa = */ _NEW_PROGRAM_CONSTANTS,
> +      /* .brw = */  BRW_NEW_CS_PROG_DATA |
> +                    BRW_NEW_PUSH_CONSTANT_ALLOCATION,
>     },
>     /* .emit = */ brw_upload_cs_state
>  };
> +
> +
> +/**
> + * Creates a region containing the push constants for the CS on gen7+.
> + *
> + * Push constants are constant values (such as GLSL uniforms) that are
> + * pre-loaded into a shader stage's register space at thread spawn time.
> + *
> + * Not all GLSL uniforms will be uploaded as push constants: The hardware has
> + * a limitation of 32 or 64 EU registers (256 or 512 floats) per stage to be
> + * uploaded as push constants, while GL 4.4 requires at least 1024 components
> + * to be usable for the VS.  Plus, currently we always use pull constants

VS?

> + * instead of push constants when doing variable-index array access.
> + *
> + * For other stages, see brw_curbe.c for the equivalent gen4/5 code and
> + * gen6_vs_state.c for gen6+.
> + */
> +static void
> +brw_upload_cs_push_constants(struct brw_context *brw,
> +                             const struct gl_program *prog,
> +                             const struct brw_cs_prog_data *cs_prog_data,
> +                             struct brw_stage_state *stage_state,
> +                             enum aub_state_struct_type type)
> +{
> +   struct gl_context *ctx = &brw->ctx;
> +   const struct brw_stage_prog_data *prog_data =
> +      (brw_stage_prog_data*) cs_prog_data;
> +
> +   /* Updates the ParamaterValues[i] pointers for all parameters of the
> +    * basic type of PROGRAM_STATE_VAR.
> +    */
> +   /* XXX: Should this happen somewhere before to get our state flag set? */
> +   _mesa_load_state_parameters(ctx, prog->Parameters);
> +
> +   if (prog_data->nr_params == 0) {
> +      stage_state->push_const_size = 0;
> +   } else {
> +      gl_constant_value *param;
> +      unsigned i, t;
> +
> +      const unsigned push_constant_size =
> +         prog_data->nr_params * sizeof(gl_constant_value);
> +      const unsigned param_aligned_count = ALIGN(push_constant_size, 8);
> +      const unsigned reg_aligned_size = 8 * param_aligned_count;
> +
> +      unsigned threads = get_cs_thread_count(cs_prog_data);
> +
> +      param = (gl_constant_value*)
> +         brw_state_batch(brw, type,
> +                         reg_aligned_size * threads,
> +                         32, &stage_state->push_const_offset);
> +      assert(param);
> +
> +      STATIC_ASSERT(sizeof(gl_constant_value) == sizeof(float));
> +
> +      /* _NEW_PROGRAM_CONSTANTS */
> +      for (t = 0; t < threads; t++) {
> +         for (i = 0; i < prog_data->nr_params; i++) {
> +            param[t * param_aligned_count + i] = *prog_data->param[i];
> +         }
> +      }
> +
> +      stage_state->push_const_size = ALIGN(prog_data->nr_params, 8) / 8;
> +   }
> +}
> +
> +
> +static void
> +gen7_upload_cs_push_constants(struct brw_context *brw)
> +{
> +   struct brw_stage_state *stage_state = &brw->cs.base;
> +
> +   /* BRW_NEW_COMPUTE_PROGRAM */
> +   const struct brw_compute_program *cp =
> +      (struct brw_compute_program *) brw->compute_program;
> +
> +   if (cp) {
> +      /* CACHE_NEW_CS_PROG */
> +      struct brw_cs_prog_data *cs_prog_data = brw->cs.prog_data;
> +
> +      brw_upload_cs_push_constants(brw, &cp->program.Base, cs_prog_data,
> +                                   stage_state, AUB_TRACE_WM_CONSTANTS);

I'm uncertain, but I wonder if we need to flag that you've destroyed the URB
state like it's done in gen7_push_constant_space state atom.

> +   }
> +}
> +
> +
> +const struct brw_tracked_state gen7_cs_push_constants = {
> +   /* .dirty = */{
> +      /* .mesa  = */ _NEW_PROGRAM_CONSTANTS,
> +      /* .brw   = */ BRW_NEW_COMPUTE_PROGRAM |
> +                     BRW_NEW_PUSH_CONSTANT_ALLOCATION,
> +   },
> +   /* .emit = */ gen7_upload_cs_push_constants,
> +};
> diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
> index 52009b3..2304fd6 100644
> --- a/src/mesa/drivers/dri/i965/brw_defines.h
> +++ b/src/mesa/drivers/dri/i965/brw_defines.h
> @@ -2507,7 +2507,13 @@ enum brw_wm_barycentric_interp_mode {
>  # define MEDIA_VFE_STATE_CURBE_ALLOC_SHIFT      0
>  # define MEDIA_VFE_STATE_CURBE_ALLOC_MASK       INTEL_MASK(15, 0)
>  
> +#define MEDIA_CURBE_LOAD                        0x7001
>  #define MEDIA_INTERFACE_DESCRIPTOR_LOAD         0x7002
> +/* GEN7 DW4, GEN8+ DW5 */
> +# define MEDIA_CURBE_READ_LENGTH_SHIFT          16
> +# define MEDIA_CURBE_READ_LENGTH_MASK           INTEL_MASK(31, 16)
> +# define MEDIA_CURBE_READ_OFFSET_SHIFT          0
> +# define MEDIA_CURBE_READ_OFFSET_MASK           INTEL_MASK(15, 0)
>  /* GEN7 DW5, GEN8+ DW6 */
>  # define MEDIA_GPGPU_THREAD_COUNT_SHIFT         0
>  # define MEDIA_GPGPU_THREAD_COUNT_MASK          INTEL_MASK(7, 0)
> diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
> index 987672f..a130040 100644
> --- a/src/mesa/drivers/dri/i965/brw_state.h
> +++ b/src/mesa/drivers/dri/i965/brw_state.h
> @@ -95,6 +95,7 @@ extern const struct brw_tracked_state brw_indices;
>  extern const struct brw_tracked_state brw_vertices;
>  extern const struct brw_tracked_state brw_index_buffer;
>  extern const struct brw_tracked_state brw_cs_state;
> +extern const struct brw_tracked_state gen7_cs_push_constants;
>  extern const struct brw_tracked_state gen6_binding_table_pointers;
>  extern const struct brw_tracked_state gen6_blend_state;
>  extern const struct brw_tracked_state gen6_cc_state_pointers;
> diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c
> index 84b0861..dba506b 100644
> --- a/src/mesa/drivers/dri/i965/brw_state_upload.c
> +++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
> @@ -250,6 +250,7 @@ static const struct brw_tracked_state *gen7_render_atoms[] =
>  static const struct brw_tracked_state *gen7_compute_atoms[] =
>  {
>     &brw_state_base_address,
> +   &gen7_cs_push_constants,
>     &brw_cs_abo_surfaces,
>     &brw_cs_state,
>  };
> @@ -333,6 +334,7 @@ static const struct brw_tracked_state *gen8_render_atoms[] =
>  static const struct brw_tracked_state *gen8_compute_atoms[] =
>  {
>     &gen8_state_base_address,
> +   &gen7_cs_push_constants,
>     &brw_cs_abo_surfaces,
>     &brw_cs_state,
>  };

I'll take another look at this after we do some more investigation on CURBE
sharing between threads.


More information about the mesa-dev mailing list