<div dir="ltr"><div dir="auto"><div><div class="gmail_extra"><div class="gmail_quote">I made a few pretty trivial comments. With those addressed, </div><div class="gmail_quote">Reviewed-by: Jason Ekstrand <<a href="mailto:jason@jlekstrand.net">jason@jlekstrand.net</a>> </div><div class="gmail_quote" dir="auto"> </div><div class="gmail_quote">On Dec 16, 2016 8:55 AM, "Juan A. Suarez Romero" <<a href="mailto:jasuarez@igalia.com" target="_blank">jasuarez@igalia.com</a>> wrote: <blockquote class="m_6457694654742999625quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">So far, input_reads was a bitmap tracking which vertex input locations were being used. In OpenGL, an attribute bigger than a vec4 (like a dvec3 or dvec4) consumes just one location, any other small attribute. So we mark the proper bit in inputs_read, and also the same bit in double_inputs_read if the attribute is a dvec3/dvec4. But in Vulkan, this is slightly different: a dvec3/dvec4 attribute consumes two locations, not just one. And hence two bits would be marked in inputs_read for the same vertex input attribute. To avoid handling two different situations in NIR, we just choose the latest one: in OpenGL, when creating NIR from GLSL/IR, any dvec3/dvec4 vertex input attribute is marked with two bits in the inputs_read bitmap (and also in the double_inputs_read), and following attributes are adjusted accordingly. As example, if in our GLSL/IR shader we have three attributes: layout(location = 0) vec3 attr0; layout(location = 1) dvec4 attr1; layout(location = 2) dvec3 attr2; then in our NIR shader we put attr0 in location 0, attr1 in locations 1 and 2, and attr2 in location 3. </blockquote></div></div></div><div dir="auto"> </div><div dir="auto">attr2 goes in locations 3 *and* 4, correct?</div><div dir="auto"> </div><div dir="auto"><div class="gmail_extra"><div class="gmail_quote"><blockquote class="m_6457694654742999625quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"> Checking carefully, basically we are using slots rather than locations in NIR. When emitting the vertices, we do a inverse map to know the corresponding location for each slot. v2 (Jason): - use two slots from inputs_read for dvec3/dvec4 NIR from GLSL/IR. --- src/compiler/glsl/glsl_to_nir.cpp | 28 +++++++++++++ src/compiler/nir/nir_gather_info.c | 48 ++++++++++----------- src/intel/vulkan/genX_pipeline.c | 63 +++++++++++++++++----------- src/mesa/drivers/dri/i965/brw_draw_upload.c | 11 +++-- src/mesa/drivers/dri/i965/brw_fs.cpp | 13 ------ src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 3 +- src/mesa/drivers/dri/i965/brw_nir.c | 6 +-- src/mesa/drivers/dri/i965/brw_nir.h | 1 - src/mesa/drivers/dri/i965/brw_vec4.cpp | 11 +++-- 9 files changed, 106 insertions(+), 78 deletions(-) diff --git a/src/compiler/glsl/glsl_to_nir.cpp b/src/compiler/glsl/glsl_to_nir.cpp index 4debc37..0814dad 100644 --- a/src/compiler/glsl/glsl_to_nir.cpp +++ b/src/compiler/glsl/glsl_to_nir.cpp @@ -129,6 +129,19 @@ private: } /* end of anonymous namespace */ +static void +nir_remap_attributes(nir_shader *shader) +{ + nir_foreach_variable(var, &shader->inputs) { + var->data.location += _mesa_bitcount_64(shader->info->double_inputs_read & + BITFIELD64_MASK(var->data.location)); + } + + /* Once the remap is done, reset double_inputs_read, so later it will have + * which location/slots are doubles */ + shader->info->double_inputs_read = 0; +} + nir_shader * glsl_to_nir(const struct gl_shader_program *shader_prog, gl_shader_stage stage, @@ -146,6 +159,13 @@ glsl_to_nir(const struct gl_shader_program *shader_prog, nir_lower_constant_initializers(shader, (nir_variable_mode)~0); + /* Remap the locations to slots so those requiring two slots will occupy + * two locations. For instance, if we have in the IR code a dvec3 attr0 in + * location 0 and vec4 attr1 in location 1, in NIR attr0 will use + * locations/slots 0 and 1, and attr1 will use location/slot 2 */ + if (shader->stage == MESA_SHADER_VERTEX) + nir_remap_attributes(shader); + shader->info->name = ralloc_asprintf(shader, "GLSL%d", shader_prog->Name); if (shader_prog->Label) shader->info->label = ralloc_strdup(shader, shader_prog->Label); @@ -315,6 +335,14 @@ nir_visitor::visit(ir_variable *ir) } else { var->data.mode = nir_var_shader_in; } + + /* Mark all the locations that require two slots */ + if (glsl_type_is_dual_slot(glsl_without_array(var->type))) { + for (uint i = 0; i < glsl_count_attribute_slots(var->type, true); i++) { + uint64_t bitfield = BITFIELD64_BIT(var->data.location + i); + shader->info->double_inputs_read |= bitfield; + } + } break; case ir_var_shader_out: diff --git a/src/compiler/nir/nir_gather_info.c b/src/compiler/nir/nir_gather_info.c index 07c9949..35a1ce4 100644 --- a/src/compiler/nir/nir_gather_info.c +++ b/src/compiler/nir/nir_gather_info.c @@ -53,11 +53,6 @@ set_io_mask(nir_shader *shader, nir_variable *var, int offset, int len) else shader->info->inputs_read |= bitfield; - /* double inputs read is only for vertex inputs */ - if (shader->stage == MESA_SHADER_VERTEX && - glsl_type_is_dual_slot(glsl_without_array(var->type))) - shader->info->double_inputs_read |= bitfield; - if (shader->stage == MESA_SHADER_FRAGMENT) { shader->info->fs.uses_sample_qualifier |= var->data.sample; } @@ -83,26 +78,21 @@ static void mark_whole_variable(nir_shader *shader, nir_variable *var) { const struct glsl_type *type = var->type; - bool is_vertex_input = false; if (nir_is_per_vertex_io(var, shader->stage)) { assert(glsl_type_is_array(type)); type = glsl_get_array_element(type); } - if (shader->stage == MESA_SHADER_VERTEX && - var->data.mode == nir_var_shader_in) - is_vertex_input = true; - const unsigned slots = var->data.compact ? DIV_ROUND_UP(glsl_get_length(type), 4) - : glsl_count_attribute_slots(type, is_vertex_input); + : glsl_count_attribute_slots(type, false); set_io_mask(shader, var, 0, slots); } static unsigned -get_io_offset(nir_deref_var *deref, bool is_vertex_input) +get_io_offset(nir_deref_var *deref) { unsigned offset = 0; @@ -117,7 +107,7 @@ get_io_offset(nir_deref_var *deref, bool is_vertex_input) return -1; } - offset += glsl_count_attribute_slots(tail->type, is_vertex_input) * + offset += glsl_count_attribute_slots(tail->type, false) * deref_array->base_offset; } /* TODO: we can get the offset for structs here see nir_lower_io() */ @@ -163,12 +153,7 @@ try_mask_partial_io(nir_shader *shader, nir_deref_var *deref) return false; } - bool is_vertex_input = false; - if (shader->stage == MESA_SHADER_VERTEX && - var->data.mode == nir_var_shader_in) - is_vertex_input = true; - - unsigned offset = get_io_offset(deref, is_vertex_input); + unsigned offset = get_io_offset(deref); if (offset == -1) return false; @@ -184,8 +169,7 @@ try_mask_partial_io(nir_shader *shader, nir_deref_var *deref) } /* double element width for double types that takes two slots */ - if (!is_vertex_input && - glsl_type_is_dual_slot(glsl_without_array(type))) { + if (glsl_type_is_dual_slot(glsl_without_array(type))) { elem_width *= 2; } @@ -220,13 +204,27 @@ gather_intrinsic_info(nir_intrinsic_instr *instr, nir_shader *shader) case nir_intrinsic_interp_var_at_sample: case nir_intrinsic_interp_var_at_offset: case nir_intrinsic_load_var: - case nir_intrinsic_store_var: - if (instr->variables[0]->var->data.mode == nir_var_shader_in || - instr->variables[0]->var->data.mode == nir_var_shader_out) { + case nir_intrinsic_store_var: { + nir_variable *var = instr->variables[0]->var; + + if (var->data.mode == nir_var_shader_in || + var->data.mode == nir_var_shader_out) { if (!try_mask_partial_io(shader, instr->variables[0])) - mark_whole_variable(shader, instr->variables[0]->var); + mark_whole_variable(shader, var); + + /* We need to track which input_reads bits correspond to a + * dvec3/dvec4 input attribute */ + if (shader->stage == MESA_SHADER_VERTEX && + var->data.mode == nir_var_shader_in && + glsl_type_is_dual_slot(glsl_without_array(var->type))) { + for (uint i = 0; i < glsl_count_attribute_slots(var->type, false); i++) { + int idx = var->data.location + i; + shader->info->double_inputs_read |= BITFIELD64_BIT(idx); + } + } } break; + } case nir_intrinsic_load_draw_id: case nir_intrinsic_load_front_face: diff --git a/src/intel/vulkan/genX_pipeline.c b/src/intel/vulkan/genX_pipeline.c index 845d020..7b94959 100644 --- a/src/intel/vulkan/genX_pipeline.c +++ b/src/intel/vulkan/genX_pipeline.c @@ -33,26 +33,33 @@ static uint32_t vertex_element_comp_control(enum isl_format format, unsigned comp) { - uint8_t bits; switch (comp) { - case 0: bits = isl_format_layouts[format].channels.r.bits; break; - case 1: bits = isl_format_layouts[format].channels.g.bits; break; - case 2: bits = isl_format_layouts[format].channels.b.bits; break; - case 3: bits = isl_format_layouts[format].channels.a.bits; break; - default: unreachable("Invalid component"); - } - - if (bits) { - return VFCOMP_STORE_SRC; - } else if (comp < 3) { - return VFCOMP_STORE_0; - } else if (isl_format_layouts[format].channels.r.type == ISL_UINT || - isl_format_layouts[format].channels.r.type == ISL_SINT) { - assert(comp == 3); - return VFCOMP_STORE_1_INT; - } else { - assert(comp == 3); - return VFCOMP_STORE_1_FP; + case 0: + return isl_format_layouts[format].channels.r.bits ? + VFCOMP_STORE_SRC : VFCOMP_STORE_0; + case 1: + return isl_format_layouts[format].channels.g.bits ? + VFCOMP_STORE_SRC : VFCOMP_STORE_0; + case 2: + return isl_format_layouts[format].channels.b.bits ? + VFCOMP_STORE_SRC : ((isl_format_layouts[format].channels.r.type == ISL_RAW) ? + VFCOMP_NOSTORE : VFCOMP_STORE_0); </blockquote></div></div></div><div dir="auto"> </div><div dir="auto">Given all the line wrapping, I think it would be clearer to just have an if ladder here</div><div dir="auto"> </div><div dir="auto"><div class="gmail_extra"><div class="gmail_quote"><blockquote class="m_6457694654742999625quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"> + case 3: + if (isl_format_layouts[format].channels.a.bits) + return VFCOMP_STORE_SRC; + else </blockquote></div></div></div><div dir="auto"> </div><div dir="auto">Please use braces when one side of the if/else is multiple lines.</div><div dir="auto"> </div><div dir="auto"><div class="gmail_extra"><div class="gmail_quote"><blockquote class="m_6457694654742999625quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"> + switch (isl_format_layouts[format].channels.r.type) { + case ISL_RAW: + return isl_format_layouts[format].channels.b.bits ? + VFCOMP_STORE_0 : VFCOMP_NOSTORE; </blockquote></div></div></div><div dir="auto"> </div><div dir="auto">This seems a bit odd. Mind explaining what's going on here?</div><div dir="auto"> </div><div dir="auto"><div class="gmail_extra"><div class="gmail_quote"><blockquote class="m_6457694654742999625quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"> + case ISL_UINT: + case ISL_SINT: + return VFCOMP_STORE_1_INT; + default: + return VFCOMP_STORE_1_FP; + } + default: + unreachable("Invalid component"); } } @@ -64,8 +71,10 @@ emit_vertex_input(struct anv_pipeline *pipeline, /* Pull inputs_read out of the VS prog data */ const uint64_t inputs_read = vs_prog_data->inputs_read; + const uint64_t double_inputs_read = vs_prog_data->double_inputs_read; assert((inputs_read & ((1 << VERT_ATTRIB_GENERIC0) - 1)) == 0); const uint32_t elements = inputs_read >> VERT_ATTRIB_GENERIC0; + const uint32_t elements_double = double_inputs_read >> VERT_ATTRIB_GENERIC0; #if GEN_GEN >= 8 /* On BDW+, we only need to allocate space for base ids. Setting up @@ -83,13 +92,16 @@ emit_vertex_input(struct anv_pipeline *pipeline, vs_prog_data->uses_baseinstance; #endif - uint32_t elem_count = __builtin_popcount(elements) + needs_svgs_elem; - if (elem_count == 0) + uint32_t elem_count = __builtin_popcount(elements) - + DIV_ROUND_UP(__builtin_popcount(elements_double), 2); </blockquote></div></div></div><div dir="auto"> </div><div dir="auto">This had better be divisible by 2...</div><div dir="auto"> </div><div dir="auto"><div class="gmail_extra"><div class="gmail_quote"><blockquote class="m_6457694654742999625quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"> + + uint32_t total_elems = elem_count + needs_svgs_elem; + if (total_elems == 0) return; uint32_t *p; - const uint32_t num_dwords = 1 + elem_count * 2; + const uint32_t num_dwords = 1 + total_elems * 2; p = anv_batch_emitn(&pipeline->batch, num_dwords, GENX(3DSTATE_VERTEX_ELEMENTS)); memset(p + 1, 0, (num_dwords - 1) * 4); @@ -107,7 +119,10 @@ emit_vertex_input(struct anv_pipeline *pipeline, if ((elements & (1 << desc->location)) == 0) continue; /* Binding unused */ - uint32_t slot = __builtin_popcount(elements & ((1 << desc->location) - 1)); + uint32_t slot = + __builtin_popcount(elements & ((1 << desc->location) - 1)) - + DIV_ROUND_UP(__builtin_popcount(elements_double & + ((1 << desc->location) -1)), 2); struct GENX(VERTEX_ELEMENT_STATE) element = { .VertexBufferIndex = desc->binding, @@ -137,7 +152,7 @@ emit_vertex_input(struct anv_pipeline *pipeline, #endif } - const uint32_t id_slot = __builtin_popcount(elements); + const uint32_t id_slot = elem_count; if (needs_svgs_elem) { /* From the Broadwell PRM for the 3D_Vertex_Component_Control enum: * "Within a VERTEX_ELEMENT_STATE structure, if a Component diff --git a/src/mesa/drivers/dri/i965/brw_draw_upload.c b/src/mesa/drivers/dri/i965/brw_draw_upload.c index b138cb7..c3655d8 100644 --- a/src/mesa/drivers/dri/i965/brw_draw_upload.c +++ b/src/mesa/drivers/dri/i965/brw_draw_upload.c @@ -472,11 +472,16 @@ brw_prepare_vertices(struct brw_context *brw) /* Accumulate the list of enabled arrays. */ brw->vb.nr_enabled = 0; while (vs_inputs) { - GLuint index = ffsll(vs_inputs) - 1; + GLuint first = ffsll(vs_inputs) - 1; + GLuint index = + first - DIV_ROUND_UP(_mesa_bitcount_64(vs_prog_data->double_inputs_read & + BITFIELD64_MASK(first)), 2); struct brw_vertex_element *input = &brw->vb.inputs[index]; input->is_dual_slot = brw->gen >= 8 && - (vs_prog_data->double_inputs_read & BITFIELD64_BIT(index)) != 0; - vs_inputs &= ~BITFIELD64_BIT(index); + (vs_prog_data->double_inputs_read & BITFIELD64_BIT(first)) != 0; + vs_inputs &= ~BITFIELD64_BIT(first); + if (input->is_dual_slot) + vs_inputs &= ~BITFIELD64_BIT(first + 1); brw->vb.enabled[brw->vb.nr_enabled++] = input; } diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index c218f56..b383d98 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -492,19 +492,6 @@ type_size_scalar(const struct glsl_type *type) return 0; } -/* Attribute arrays are loaded as one vec4 per element (or matrix column), - * except for double-precision types, which are loaded as one dvec4. - */ -extern "C" int -type_size_vs_input(const struct glsl_type *type) -{ - if (type->is_double()) { - return type_size_dvec4(type); - } else { - return type_size_vec4(type); - } -} - /** * Create a MOV to read the timestamp register. * diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp index 14415bd..5c356d6 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp @@ -36,8 +36,7 @@ fs_reg * fs_visitor::emit_vs_system_value(int location) { fs_reg *reg = new(this->mem_ctx) - fs_reg(ATTR, 4 * (_mesa_bitcount_64(nir->info->inputs_read) + - _mesa_bitcount_64(nir->info->double_inputs_read)), + fs_reg(ATTR, 4 * _mesa_bitcount_64(nir->info->inputs_read), BRW_REGISTER_TYPE_D); struct brw_vs_prog_data *vs_prog_data = brw_vs_prog_data(prog_data); diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c index 763e3ec..24334c8 100644 --- a/src/mesa/drivers/dri/i965/brw_nir.c +++ b/src/mesa/drivers/dri/i965/brw_nir.c @@ -113,9 +113,7 @@ remap_vs_attrs(nir_block *block, shader_info *nir_info) int attr = intrin->const_index[0]; int slot = _mesa_bitcount_64(nir_info->inputs_read & BITFIELD64_MASK(attr)); - int dslot = _mesa_bitcount_64(nir_info->double_inputs_read & - BITFIELD64_MASK(attr)); - intrin->const_index[0] = 4 * (slot + dslot); + intrin->const_index[0] = 4 * slot; } } return true; @@ -204,7 +202,7 @@ brw_nir_lower_vs_inputs(nir_shader *nir, * loaded as one vec4 or dvec4 per element (or matrix column), depending on * whether it is a double-precision type or not. */ - nir_lower_io(nir, nir_var_shader_in, type_size_vs_input, 0); + nir_lower_io(nir, nir_var_shader_in, type_size_vec4, 0); /* This pass needs actual constants */ nir_opt_constant_folding(nir); diff --git a/src/mesa/drivers/dri/i965/brw_nir.h b/src/mesa/drivers/dri/i965/brw_nir.h index 3c774d0..04c7ef8 100644 --- a/src/mesa/drivers/dri/i965/brw_nir.h +++ b/src/mesa/drivers/dri/i965/brw_nir.h @@ -34,7 +34,6 @@ extern "C" { int type_size_scalar(const struct glsl_type *type); int type_size_vec4(const struct glsl_type *type); int type_size_dvec4(const struct glsl_type *type); -int type_size_vs_input(const struct glsl_type *type); static inline int type_size_scalar_bytes(const struct glsl_type *type) diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp index b9e592f..852e88f 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp @@ -2127,7 +2127,7 @@ brw_compile_vs(const struct brw_compiler *compiler, void *log_data, ((1 << shader->info->cull_distance_array_size) - 1) << shader->info->clip_distance_array_size; - unsigned nr_attributes = _mesa_bitcount_64(prog_data->inputs_read); + unsigned nr_attribute_slots = _mesa_bitcount_64(prog_data->inputs_read); /* gl_VertexID and gl_InstanceID are system values, but arrive via an * incoming vertex attribute. So, add an extra slot. @@ -2137,18 +2137,17 @@ brw_compile_vs(const struct brw_compiler *compiler, void *log_data, BITFIELD64_BIT(SYSTEM_VALUE_BASE_INSTANCE) | BITFIELD64_BIT(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) | BITFIELD64_BIT(SYSTEM_VALUE_INSTANCE_ID))) { - nr_attributes++; + nr_attribute_slots++; } /* gl_DrawID has its very own vec4 */ if (shader->info->system_values_read & BITFIELD64_BIT(SYSTEM_VALUE_DRAW_ID)) { - nr_attributes++; + nr_attribute_slots++; } - unsigned nr_attribute_slots = - nr_attributes + - _mesa_bitcount_64(shader->info->double_inputs_read); + unsigned nr_attributes = nr_attribute_slots - + DIV_ROUND_UP(_mesa_bitcount_64(shader->info->double_inputs_read), 2); /* The 3DSTATE_VS documentation lists the lower bound on "Vertex URB Entry * Read Length" as 1 in vec4 mode, and 0 in SIMD8 mode. Empirically, in -- 2.9.3 _______________________________________________ mesa-dev mailing list <a href="mailto:mesa-dev@lists.freedesktop.org" target="_blank">mesa-dev@lists.freedesktop.org</a> <a href="https://lists.freedesktop.org/mailman/listinfo/mesa-dev" rel="noreferrer" target="_blank">https://lists.freedesktop.org/mailman/listinfo/mesa-dev</a> </blockquote></div> </div></div></div> </div>