[Mesa-dev] [PATCH v3 19/22] nir/i965: use two slots from inputs_read for dvec3/dvec4 vertex input attributes
Jason Ekstrand
jason at jlekstrand.net
Thu Jan 5 20:50:15 UTC 2017
On Thu, Jan 5, 2017 at 2:18 AM, Samuel Iglesias Gonsálvez <
siglesias at igalia.com> wrote:
> From: "Juan A. Suarez Romero" <jasuarez at igalia.com>
>
> So far, input_reads was a bitmap tracking which vertex input locations
> were being used.
>
> In OpenGL, an attribute bigger than a vec4 (like a dvec3 or dvec4)
> consumes just one location, any other small attribute. So we mark the
> proper bit in inputs_read, and also the same bit in double_inputs_read
> if the attribute is a dvec3/dvec4.
>
> But in Vulkan, this is slightly different: a dvec3/dvec4 attribute
> consumes two locations, not just one. And hence two bits would be marked
> in inputs_read for the same vertex input attribute.
>
> To avoid handling two different situations in NIR, we just choose the
> latest one: in OpenGL, when creating NIR from GLSL/IR, any dvec3/dvec4
> vertex input attribute is marked with two bits in the inputs_read bitmap
> (and also in the double_inputs_read), and following attributes are
> adjusted accordingly.
>
> As example, if in our GLSL/IR shader we have three attributes:
>
> layout(location = 0) vec3 attr0;
> layout(location = 1) dvec4 attr1;
> layout(location = 2) dvec3 attr2;
>
> then in our NIR shader we put attr0 in location 0, attr1 in locations 1
> and 2, and attr2 in location 3 and 4.
>
> Checking carefully, basically we are using slots rather than locations
> in NIR.
>
> When emitting the vertices, we do a inverse map to know the
> corresponding location for each slot.
>
> v2 (Jason):
> - use two slots from inputs_read for dvec3/dvec4 NIR from GLSL/IR.
>
> v3 (Jason):
> - Fix commit log error.
> - Use ladder ifs and fix braces.
> - elements_double is divisible by 2, don't need DIV_ROUND_UP().
> - Use if ladder instead of a switch.
>
> Reviewed-by: Jason Ekstrand <jason at jlekstrand.net>
> ---
> src/compiler/glsl/glsl_to_nir.cpp | 28 ++++++++++++++++
> src/compiler/nir/nir_gather_info.c | 48
> +++++++++++++---------------
> src/intel/vulkan/genX_pipeline.c | 28 ++++++++++++----
> src/mesa/drivers/dri/i965/brw_draw_upload.c | 11 +++++--
> src/mesa/drivers/dri/i965/brw_fs.cpp | 13 --------
> src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 3 +-
> src/mesa/drivers/dri/i965/brw_nir.c | 6 ++--
> src/mesa/drivers/dri/i965/brw_nir.h | 1 -
> src/mesa/drivers/dri/i965/brw_vec4.cpp | 11 +++----
> 9 files changed, 89 insertions(+), 60 deletions(-)
>
> diff --git a/src/compiler/glsl/glsl_to_nir.cpp
> b/src/compiler/glsl/glsl_to_nir.cpp
> index 4e0a33a9e74..1e0a7f0255c 100644
> --- a/src/compiler/glsl/glsl_to_nir.cpp
> +++ b/src/compiler/glsl/glsl_to_nir.cpp
> @@ -129,6 +129,19 @@ private:
>
> } /* end of anonymous namespace */
>
> +static void
> +nir_remap_attributes(nir_shader *shader)
> +{
> + nir_foreach_variable(var, &shader->inputs) {
> + var->data.location += _mesa_bitcount_64(shader->info->double_inputs_read
> &
> + BITFIELD64_MASK(var->data.
> location));
> + }
> +
> + /* Once the remap is done, reset double_inputs_read, so later it will
> have
> + * which location/slots are doubles */
> + shader->info->double_inputs_read = 0;
> +}
> +
> nir_shader *
> glsl_to_nir(const struct gl_shader_program *shader_prog,
> gl_shader_stage stage,
> @@ -146,6 +159,13 @@ glsl_to_nir(const struct gl_shader_program
> *shader_prog,
>
> nir_lower_constant_initializers(shader, (nir_variable_mode)~0);
>
> + /* Remap the locations to slots so those requiring two slots will
> occupy
> + * two locations. For instance, if we have in the IR code a dvec3
> attr0 in
> + * location 0 and vec4 attr1 in location 1, in NIR attr0 will use
> + * locations/slots 0 and 1, and attr1 will use location/slot 2 */
> + if (shader->stage == MESA_SHADER_VERTEX)
> + nir_remap_attributes(shader);
> +
> shader->info->name = ralloc_asprintf(shader, "GLSL%d",
> shader_prog->Name);
> if (shader_prog->Label)
> shader->info->label = ralloc_strdup(shader, shader_prog->Label);
> @@ -318,6 +338,14 @@ nir_visitor::visit(ir_variable *ir)
> } else {
> var->data.mode = nir_var_shader_in;
> }
> +
> + /* Mark all the locations that require two slots */
> + if (glsl_type_is_dual_slot(glsl_without_array(var->type))) {
> + for (uint i = 0; i < glsl_count_attribute_slots(var->type,
> true); i++) {
> + uint64_t bitfield = BITFIELD64_BIT(var->data.location + i);
> + shader->info->double_inputs_read |= bitfield;
> + }
> + }
> break;
>
> case ir_var_shader_out:
> diff --git a/src/compiler/nir/nir_gather_info.c
> b/src/compiler/nir/nir_gather_info.c
> index 07c99497146..35a1ce4dec6 100644
> --- a/src/compiler/nir/nir_gather_info.c
> +++ b/src/compiler/nir/nir_gather_info.c
> @@ -53,11 +53,6 @@ set_io_mask(nir_shader *shader, nir_variable *var, int
> offset, int len)
> else
> shader->info->inputs_read |= bitfield;
>
> - /* double inputs read is only for vertex inputs */
> - if (shader->stage == MESA_SHADER_VERTEX &&
> - glsl_type_is_dual_slot(glsl_without_array(var->type)))
> - shader->info->double_inputs_read |= bitfield;
> -
> if (shader->stage == MESA_SHADER_FRAGMENT) {
> shader->info->fs.uses_sample_qualifier |= var->data.sample;
> }
> @@ -83,26 +78,21 @@ static void
> mark_whole_variable(nir_shader *shader, nir_variable *var)
> {
> const struct glsl_type *type = var->type;
> - bool is_vertex_input = false;
>
> if (nir_is_per_vertex_io(var, shader->stage)) {
> assert(glsl_type_is_array(type));
> type = glsl_get_array_element(type);
> }
>
> - if (shader->stage == MESA_SHADER_VERTEX &&
> - var->data.mode == nir_var_shader_in)
> - is_vertex_input = true;
> -
> const unsigned slots =
> var->data.compact ? DIV_ROUND_UP(glsl_get_length(type), 4)
> - : glsl_count_attribute_slots(type,
> is_vertex_input);
> + : glsl_count_attribute_slots(type, false);
>
> set_io_mask(shader, var, 0, slots);
> }
>
> static unsigned
> -get_io_offset(nir_deref_var *deref, bool is_vertex_input)
> +get_io_offset(nir_deref_var *deref)
> {
> unsigned offset = 0;
>
> @@ -117,7 +107,7 @@ get_io_offset(nir_deref_var *deref, bool
> is_vertex_input)
> return -1;
> }
>
> - offset += glsl_count_attribute_slots(tail->type,
> is_vertex_input) *
> + offset += glsl_count_attribute_slots(tail->type, false) *
> deref_array->base_offset;
> }
> /* TODO: we can get the offset for structs here see nir_lower_io()
> */
> @@ -163,12 +153,7 @@ try_mask_partial_io(nir_shader *shader, nir_deref_var
> *deref)
> return false;
> }
>
> - bool is_vertex_input = false;
> - if (shader->stage == MESA_SHADER_VERTEX &&
> - var->data.mode == nir_var_shader_in)
> - is_vertex_input = true;
> -
> - unsigned offset = get_io_offset(deref, is_vertex_input);
> + unsigned offset = get_io_offset(deref);
> if (offset == -1)
> return false;
>
> @@ -184,8 +169,7 @@ try_mask_partial_io(nir_shader *shader, nir_deref_var
> *deref)
> }
>
> /* double element width for double types that takes two slots */
> - if (!is_vertex_input &&
> - glsl_type_is_dual_slot(glsl_without_array(type))) {
> + if (glsl_type_is_dual_slot(glsl_without_array(type))) {
> elem_width *= 2;
> }
>
> @@ -220,13 +204,27 @@ gather_intrinsic_info(nir_intrinsic_instr *instr,
> nir_shader *shader)
> case nir_intrinsic_interp_var_at_sample:
> case nir_intrinsic_interp_var_at_offset:
> case nir_intrinsic_load_var:
> - case nir_intrinsic_store_var:
> - if (instr->variables[0]->var->data.mode == nir_var_shader_in ||
> - instr->variables[0]->var->data.mode == nir_var_shader_out) {
> + case nir_intrinsic_store_var: {
> + nir_variable *var = instr->variables[0]->var;
> +
> + if (var->data.mode == nir_var_shader_in ||
> + var->data.mode == nir_var_shader_out) {
> if (!try_mask_partial_io(shader, instr->variables[0]))
> - mark_whole_variable(shader, instr->variables[0]->var);
> + mark_whole_variable(shader, var);
> +
> + /* We need to track which input_reads bits correspond to a
> + * dvec3/dvec4 input attribute */
> + if (shader->stage == MESA_SHADER_VERTEX &&
> + var->data.mode == nir_var_shader_in &&
> + glsl_type_is_dual_slot(glsl_without_array(var->type))) {
> + for (uint i = 0; i < glsl_count_attribute_slots(var->type,
> false); i++) {
> + int idx = var->data.location + i;
> + shader->info->double_inputs_read |= BITFIELD64_BIT(idx);
> + }
> + }
> }
> break;
> + }
>
> case nir_intrinsic_load_draw_id:
> case nir_intrinsic_load_front_face:
> diff --git a/src/intel/vulkan/genX_pipeline.c b/src/intel/vulkan/genX_
> pipeline.c
> index 845d0204d8c..48a8fad5965 100644
> --- a/src/intel/vulkan/genX_pipeline.c
> +++ b/src/intel/vulkan/genX_pipeline.c
> @@ -44,7 +44,15 @@ vertex_element_comp_control(enum isl_format format,
> unsigned comp)
>
> if (bits) {
> return VFCOMP_STORE_SRC;
> - } else if (comp < 3) {
> + } else if (comp >= 2 &&
> + !isl_format_layouts[format].channels.b.bits &&
> + isl_format_layouts[format].channels.r.type == ISL_RAW) {
> + /* When emitting 64-bit attributes, we need to write either 128 or
> 256
> + * bit chunks, using VFCOMP_NOSTORE when not writing the chunk, and
> + * VFCOMP_STORE_0 to pad the written chunk */
> + return VFCOMP_NOSTORE;
> + } else if (comp < 3 ||
> + (comp == 3 && isl_format_layouts[format].channels.r.type
> == ISL_RAW)) {
>
You don't need "comp == 3" here. Also, please add a PRM citation for the
restriction on PASSTHROUGH formats. With that, R-B still applies.
> return VFCOMP_STORE_0;
> } else if (isl_format_layouts[format].channels.r.type == ISL_UINT ||
> isl_format_layouts[format].channels.r.type == ISL_SINT) {
> @@ -64,8 +72,10 @@ emit_vertex_input(struct anv_pipeline *pipeline,
>
> /* Pull inputs_read out of the VS prog data */
> const uint64_t inputs_read = vs_prog_data->inputs_read;
> + const uint64_t double_inputs_read = vs_prog_data->double_inputs_read;
> assert((inputs_read & ((1 << VERT_ATTRIB_GENERIC0) - 1)) == 0);
> const uint32_t elements = inputs_read >> VERT_ATTRIB_GENERIC0;
> + const uint32_t elements_double = double_inputs_read >>
> VERT_ATTRIB_GENERIC0;
>
> #if GEN_GEN >= 8
> /* On BDW+, we only need to allocate space for base ids. Setting up
> @@ -83,13 +93,16 @@ emit_vertex_input(struct anv_pipeline *pipeline,
> vs_prog_data->uses_baseinstance;
> #endif
>
> - uint32_t elem_count = __builtin_popcount(elements) + needs_svgs_elem;
> - if (elem_count == 0)
> + uint32_t elem_count = __builtin_popcount(elements) -
> + __builtin_popcount(elements_double) / 2;
> +
> + uint32_t total_elems = elem_count + needs_svgs_elem;
> + if (total_elems == 0)
> return;
>
> uint32_t *p;
>
> - const uint32_t num_dwords = 1 + elem_count * 2;
> + const uint32_t num_dwords = 1 + total_elems * 2;
> p = anv_batch_emitn(&pipeline->batch, num_dwords,
> GENX(3DSTATE_VERTEX_ELEMENTS));
> memset(p + 1, 0, (num_dwords - 1) * 4);
> @@ -107,7 +120,10 @@ emit_vertex_input(struct anv_pipeline *pipeline,
> if ((elements & (1 << desc->location)) == 0)
> continue; /* Binding unused */
>
> - uint32_t slot = __builtin_popcount(elements & ((1 <<
> desc->location) - 1));
> + uint32_t slot =
> + __builtin_popcount(elements & ((1 << desc->location) - 1)) -
> + DIV_ROUND_UP(__builtin_popcount(elements_double &
> + ((1 << desc->location) -1)), 2);
>
> struct GENX(VERTEX_ELEMENT_STATE) element = {
> .VertexBufferIndex = desc->binding,
> @@ -137,7 +153,7 @@ emit_vertex_input(struct anv_pipeline *pipeline,
> #endif
> }
>
> - const uint32_t id_slot = __builtin_popcount(elements);
> + const uint32_t id_slot = elem_count;
> if (needs_svgs_elem) {
> /* From the Broadwell PRM for the 3D_Vertex_Component_Control enum:
> * "Within a VERTEX_ELEMENT_STATE structure, if a Component
> diff --git a/src/mesa/drivers/dri/i965/brw_draw_upload.c
> b/src/mesa/drivers/dri/i965/brw_draw_upload.c
> index 57815645924..b7527f2cd9b 100644
> --- a/src/mesa/drivers/dri/i965/brw_draw_upload.c
> +++ b/src/mesa/drivers/dri/i965/brw_draw_upload.c
> @@ -481,11 +481,16 @@ brw_prepare_vertices(struct brw_context *brw)
> /* Accumulate the list of enabled arrays. */
> brw->vb.nr_enabled = 0;
> while (vs_inputs) {
> - GLuint index = ffsll(vs_inputs) - 1;
> + GLuint first = ffsll(vs_inputs) - 1;
> + GLuint index =
> + first - DIV_ROUND_UP(_mesa_bitcount_64(vs_prog_data->double_inputs_read
> &
> + BITFIELD64_MASK(first)),
> 2);
> struct brw_vertex_element *input = &brw->vb.inputs[index];
> input->is_dual_slot = brw->gen >= 8 &&
> - (vs_prog_data->double_inputs_read & BITFIELD64_BIT(index)) != 0;
> - vs_inputs &= ~BITFIELD64_BIT(index);
> + (vs_prog_data->double_inputs_read & BITFIELD64_BIT(first)) != 0;
> + vs_inputs &= ~BITFIELD64_BIT(first);
> + if (input->is_dual_slot)
> + vs_inputs &= ~BITFIELD64_BIT(first + 1);
> brw->vb.enabled[brw->vb.nr_enabled++] = input;
> }
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp
> b/src/mesa/drivers/dri/i965/brw_fs.cpp
> index c8a069386dd..03f9c24d151 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
> @@ -492,19 +492,6 @@ type_size_scalar(const struct glsl_type *type)
> return 0;
> }
>
> -/* Attribute arrays are loaded as one vec4 per element (or matrix column),
> - * except for double-precision types, which are loaded as one dvec4.
> - */
> -extern "C" int
> -type_size_vs_input(const struct glsl_type *type)
> -{
> - if (type->is_double()) {
> - return type_size_dvec4(type);
> - } else {
> - return type_size_vec4(type);
> - }
> -}
> -
> /**
> * Create a MOV to read the timestamp register.
> *
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> index 14415bd5c7a..5c356d61ff4 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> @@ -36,8 +36,7 @@ fs_reg *
> fs_visitor::emit_vs_system_value(int location)
> {
> fs_reg *reg = new(this->mem_ctx)
> - fs_reg(ATTR, 4 * (_mesa_bitcount_64(nir->info->inputs_read) +
> - _mesa_bitcount_64(nir->info->
> double_inputs_read)),
> + fs_reg(ATTR, 4 * _mesa_bitcount_64(nir->info->inputs_read),
> BRW_REGISTER_TYPE_D);
> struct brw_vs_prog_data *vs_prog_data = brw_vs_prog_data(prog_data);
>
> diff --git a/src/mesa/drivers/dri/i965/brw_nir.c
> b/src/mesa/drivers/dri/i965/brw_nir.c
> index 6f37e97a86f..e65a1786e2b 100644
> --- a/src/mesa/drivers/dri/i965/brw_nir.c
> +++ b/src/mesa/drivers/dri/i965/brw_nir.c
> @@ -113,9 +113,7 @@ remap_vs_attrs(nir_block *block, shader_info *nir_info)
> int attr = intrin->const_index[0];
> int slot = _mesa_bitcount_64(nir_info->inputs_read &
> BITFIELD64_MASK(attr));
> - int dslot = _mesa_bitcount_64(nir_info->double_inputs_read &
> - BITFIELD64_MASK(attr));
> - intrin->const_index[0] = 4 * (slot + dslot);
> + intrin->const_index[0] = 4 * slot;
> }
> }
> return true;
> @@ -204,7 +202,7 @@ brw_nir_lower_vs_inputs(nir_shader *nir,
> * loaded as one vec4 or dvec4 per element (or matrix column),
> depending on
> * whether it is a double-precision type or not.
> */
> - nir_lower_io(nir, nir_var_shader_in, type_size_vs_input, 0);
> + nir_lower_io(nir, nir_var_shader_in, type_size_vec4, 0);
>
> /* This pass needs actual constants */
> nir_opt_constant_folding(nir);
> diff --git a/src/mesa/drivers/dri/i965/brw_nir.h
> b/src/mesa/drivers/dri/i965/brw_nir.h
> index 8cfb6c1be68..012343b6b82 100644
> --- a/src/mesa/drivers/dri/i965/brw_nir.h
> +++ b/src/mesa/drivers/dri/i965/brw_nir.h
> @@ -34,7 +34,6 @@ extern "C" {
> int type_size_scalar(const struct glsl_type *type);
> int type_size_vec4(const struct glsl_type *type);
> int type_size_dvec4(const struct glsl_type *type);
> -int type_size_vs_input(const struct glsl_type *type);
>
> static inline int
> type_size_scalar_bytes(const struct glsl_type *type)
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp
> b/src/mesa/drivers/dri/i965/brw_vec4.cpp
> index f096ce9e020..f84729dc636 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
> @@ -2737,7 +2737,7 @@ brw_compile_vs(const struct brw_compiler *compiler,
> void *log_data,
> ((1 << shader->info->cull_distance_array_size) - 1) <<
> shader->info->clip_distance_array_size;
>
> - unsigned nr_attributes = _mesa_bitcount_64(prog_data->inputs_read);
> + unsigned nr_attribute_slots = _mesa_bitcount_64(prog_data->
> inputs_read);
>
> /* gl_VertexID and gl_InstanceID are system values, but arrive via an
> * incoming vertex attribute. So, add an extra slot.
> @@ -2747,18 +2747,17 @@ brw_compile_vs(const struct brw_compiler
> *compiler, void *log_data,
> BITFIELD64_BIT(SYSTEM_VALUE_BASE_INSTANCE) |
> BITFIELD64_BIT(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) |
> BITFIELD64_BIT(SYSTEM_VALUE_INSTANCE_ID))) {
> - nr_attributes++;
> + nr_attribute_slots++;
> }
>
> /* gl_DrawID has its very own vec4 */
> if (shader->info->system_values_read &
> BITFIELD64_BIT(SYSTEM_VALUE_DRAW_ID)) {
> - nr_attributes++;
> + nr_attribute_slots++;
> }
>
> - unsigned nr_attribute_slots =
> - nr_attributes +
> - _mesa_bitcount_64(shader->info->double_inputs_read);
> + unsigned nr_attributes = nr_attribute_slots -
> + DIV_ROUND_UP(_mesa_bitcount_64(shader->info->double_inputs_read),
> 2);
>
> /* The 3DSTATE_VS documentation lists the lower bound on "Vertex URB
> Entry
> * Read Length" as 1 in vec4 mode, and 0 in SIMD8 mode. Empirically,
> in
> --
> 2.11.0
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/mesa-dev/attachments/20170105/46e907fd/attachment-0001.html>
More information about the mesa-dev
mailing list