[Mesa-dev] [PATCH 19/22] anv/nir: add support for dvec3/4 consuming two locations
Jason Ekstrand
jason at jlekstrand.net
Fri Dec 2 06:22:06 UTC 2016
+Ken
On Thu, Dec 1, 2016 at 10:17 PM, Jason Ekstrand <jason at jlekstrand.net>
wrote:
> I'm not sure how I feel about this one. It seems like it would almost be
> easier to just pick one convention or the other for NIR and adjust one of
> the drivers accordingly. I don't know that I have a huge preference which
> convention we choose. I guess the Vulkan convention matches our hardware a
> bit better. In either case, converting from one to the other should be a
> simple matter of building a remap table or a creative use of popcount.
>
> On Fri, Nov 25, 2016 at 12:52 AM, Juan A. Suarez Romero <
> jasuarez at igalia.com> wrote:
>
>> One difference between OpenGL and Vulkan regarding 64-bit vertex
>> attribute types is that dvec3 and dvec4 consumes just one location in
>> OpenGL, while in Vulkan it consumes two locations.
>>
>> Thus, in OpenGL for each dvec3/dvec4 vertex attrib we mark just one bit
>> in our internal inputs_read bitmap (and also the corresponding bit in
>> double_inputs_read bitmap) while in Vulkan we mark two consecutive bits
>> in both bitmaps.
>>
>> This is handled with a nir option called "dvec3_consumes_two_locations",
>> which is set to True for Vulkan code. And all the computation regarding
>> emitting vertices as well as the mapping between attributes and physical
>> registers use this option to correctly do the work.
>> ---
>> src/amd/vulkan/radv_pipeline.c | 1 +
>> src/compiler/nir/nir.h | 5 +++
>> src/compiler/nir/nir_gather_info.c | 6 +--
>> src/gallium/drivers/freedreno/ir3/ir3_nir.c | 1 +
>> src/intel/vulkan/anv_device.c | 2 +-
>> src/intel/vulkan/genX_pipeline.c | 62
>> +++++++++++++++++-----------
>> src/mesa/drivers/dri/i965/brw_compiler.c | 23 ++++++++++-
>> src/mesa/drivers/dri/i965/brw_compiler.h | 2 +-
>> src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 14 +++++--
>> src/mesa/drivers/dri/i965/brw_nir.c | 18 +++++---
>> src/mesa/drivers/dri/i965/brw_vec4.cpp | 13 ++++--
>> src/mesa/drivers/dri/i965/intel_screen.c | 3 +-
>> 12 files changed, 105 insertions(+), 45 deletions(-)
>>
>> diff --git a/src/amd/vulkan/radv_pipeline.c
>> b/src/amd/vulkan/radv_pipeline.c
>> index ee5d812..90d4650 100644
>> --- a/src/amd/vulkan/radv_pipeline.c
>> +++ b/src/amd/vulkan/radv_pipeline.c
>> @@ -59,6 +59,7 @@ static const struct nir_shader_compiler_options
>> nir_options = {
>> .lower_unpack_unorm_4x8 = true,
>> .lower_extract_byte = true,
>> .lower_extract_word = true,
>> + .dvec3_consumes_two_locations = true,
>> };
>>
>> VkResult radv_CreateShaderModule(
>> diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
>> index 1679d89..0fc8f39 100644
>> --- a/src/compiler/nir/nir.h
>> +++ b/src/compiler/nir/nir.h
>> @@ -1794,6 +1794,11 @@ typedef struct nir_shader_compiler_options {
>> * information must be inferred from the list of input nir_variables.
>> */
>> bool use_interpolated_input_intrinsics;
>> +
>> + /**
>> + * In Vulkan, a dvec3/dvec4 consumes two locations instead just one.
>> + */
>> + bool dvec3_consumes_two_locations;
>> } nir_shader_compiler_options;
>>
>> typedef struct nir_shader {
>> diff --git a/src/compiler/nir/nir_gather_info.c
>> b/src/compiler/nir/nir_gather_info.c
>> index 07c9949..8c80671 100644
>> --- a/src/compiler/nir/nir_gather_info.c
>> +++ b/src/compiler/nir/nir_gather_info.c
>> @@ -96,7 +96,7 @@ mark_whole_variable(nir_shader *shader, nir_variable
>> *var)
>>
>> const unsigned slots =
>> var->data.compact ? DIV_ROUND_UP(glsl_get_length(type), 4)
>> - : glsl_count_attribute_slots(type,
>> is_vertex_input);
>> + : glsl_count_attribute_slots(type,
>> is_vertex_input && !shader->options->dvec3_consumes_two_locations);
>>
>
> This makes no sense, why are we passing is_vertex_input &&
> !dvec3_consumes_two_locations to an argument labled is_vertex_input?
>
>
>>
>> set_io_mask(shader, var, 0, slots);
>> }
>> @@ -168,7 +168,7 @@ try_mask_partial_io(nir_shader *shader, nir_deref_var
>> *deref)
>> var->data.mode == nir_var_shader_in)
>> is_vertex_input = true;
>>
>> - unsigned offset = get_io_offset(deref, is_vertex_input);
>> + unsigned offset = get_io_offset(deref, is_vertex_input &&
>> !shader->options->dvec3_consumes_two_locations);
>>
>
> Same here
>
>
>> if (offset == -1)
>> return false;
>>
>> @@ -184,7 +184,7 @@ try_mask_partial_io(nir_shader *shader, nir_deref_var
>> *deref)
>> }
>>
>> /* double element width for double types that takes two slots */
>> - if (!is_vertex_input &&
>> + if ((!is_vertex_input || shader->options->dvec3_consumes_two_locations)
>> &&
>> glsl_type_is_dual_slot(glsl_without_array(type))) {
>>
>
> This makes a bit more sense but I'm still not quite seeing it.
>
>
>> elem_width *= 2;
>> }
>> diff --git a/src/gallium/drivers/freedreno/ir3/ir3_nir.c
>> b/src/gallium/drivers/freedreno/ir3/ir3_nir.c
>> index 2d86a52..5c5c9ad 100644
>> --- a/src/gallium/drivers/freedreno/ir3/ir3_nir.c
>> +++ b/src/gallium/drivers/freedreno/ir3/ir3_nir.c
>> @@ -50,6 +50,7 @@ static const nir_shader_compiler_options options = {
>> .vertex_id_zero_based = true,
>> .lower_extract_byte = true,
>> .lower_extract_word = true,
>> + .dvec3_consumes_two_locations = false,
>> };
>>
>> struct nir_shader *
>> diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.
>> c
>> index 2c8ac49..725848f 100644
>> --- a/src/intel/vulkan/anv_device.c
>> +++ b/src/intel/vulkan/anv_device.c
>> @@ -167,7 +167,7 @@ anv_physical_device_init(struct anv_physical_device
>> *device,
>>
>> brw_process_intel_debug_variable();
>>
>> - device->compiler = brw_compiler_create(NULL, &device->info);
>> + device->compiler = brw_compiler_create(NULL, &device->info, true);
>> if (device->compiler == NULL) {
>> result = vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
>> goto fail;
>> diff --git a/src/intel/vulkan/genX_pipeline.c
>> b/src/intel/vulkan/genX_pipeline.c
>> index cb164ad..97c40b8 100644
>> --- a/src/intel/vulkan/genX_pipeline.c
>> +++ b/src/intel/vulkan/genX_pipeline.c
>> @@ -33,26 +33,33 @@
>> static uint32_t
>> vertex_element_comp_control(enum isl_format format, unsigned comp)
>> {
>> - uint8_t bits;
>> switch (comp) {
>> - case 0: bits = isl_format_layouts[format].channels.r.bits; break;
>> - case 1: bits = isl_format_layouts[format].channels.g.bits; break;
>> - case 2: bits = isl_format_layouts[format].channels.b.bits; break;
>> - case 3: bits = isl_format_layouts[format].channels.a.bits; break;
>> - default: unreachable("Invalid component");
>> - }
>> -
>> - if (bits) {
>> - return VFCOMP_STORE_SRC;
>> - } else if (comp < 3) {
>> - return VFCOMP_STORE_0;
>> - } else if (isl_format_layouts[format].channels.r.type == ISL_UINT ||
>> - isl_format_layouts[format].channels.r.type == ISL_SINT) {
>> - assert(comp == 3);
>> - return VFCOMP_STORE_1_INT;
>> - } else {
>> - assert(comp == 3);
>> - return VFCOMP_STORE_1_FP;
>> + case 0:
>> + return isl_format_layouts[format].channels.r.bits ?
>> + VFCOMP_STORE_SRC : VFCOMP_STORE_0;
>> + case 1:
>> + return isl_format_layouts[format].channels.g.bits ?
>> + VFCOMP_STORE_SRC : VFCOMP_STORE_0;
>> + case 2:
>> + return isl_format_layouts[format].channels.b.bits ?
>> + VFCOMP_STORE_SRC : ((isl_format_layouts[format].channels.r.type
>> == ISL_RAW) ?
>> + VFCOMP_NOSTORE : VFCOMP_STORE_0);
>> + case 3:
>> + if (isl_format_layouts[format].channels.a.bits)
>> + return VFCOMP_STORE_SRC;
>> + else
>> + switch (isl_format_layouts[format].channels.r.type) {
>> + case ISL_RAW:
>> + return isl_format_layouts[format].channels.b.bits ?
>> + VFCOMP_STORE_0 : VFCOMP_NOSTORE;
>> + case ISL_UINT:
>> + case ISL_SINT:
>> + return VFCOMP_STORE_1_INT;
>> + default:
>> + return VFCOMP_STORE_1_FP;
>> + }
>> + default:
>> + unreachable("Invalid component");
>> }
>> }
>>
>> @@ -64,8 +71,10 @@ emit_vertex_input(struct anv_pipeline *pipeline,
>>
>> /* Pull inputs_read out of the VS prog data */
>> const uint64_t inputs_read = vs_prog_data->inputs_read;
>> + const uint64_t double_inputs_read = vs_prog_data->double_inputs_read;
>> assert((inputs_read & ((1 << VERT_ATTRIB_GENERIC0) - 1)) == 0);
>> const uint32_t elements = inputs_read >> VERT_ATTRIB_GENERIC0;
>> + const uint32_t elements_double = double_inputs_read >>
>> VERT_ATTRIB_GENERIC0;
>>
>> #if GEN_GEN >= 8
>> /* On BDW+, we only need to allocate space for base ids. Setting up
>> @@ -83,13 +92,16 @@ emit_vertex_input(struct anv_pipeline *pipeline,
>> vs_prog_data->uses_baseinstance;
>> #endif
>>
>> - uint32_t elem_count = __builtin_popcount(elements) + needs_svgs_elem;
>> - if (elem_count == 0)
>> + uint32_t elem_count =
>> + __builtin_popcount(elements) - DIV_ROUND_UP(__builtin_popcount(elements_double),
>> 2);
>> +
>> + uint32_t total_elems = elem_count + needs_svgs_elem;
>> + if (total_elems == 0)
>> return;
>>
>> uint32_t *p;
>>
>> - const uint32_t num_dwords = 1 + elem_count * 2;
>> + const uint32_t num_dwords = 1 + total_elems * 2;
>> p = anv_batch_emitn(&pipeline->batch, num_dwords,
>> GENX(3DSTATE_VERTEX_ELEMENTS));
>> memset(p + 1, 0, (num_dwords - 1) * 4);
>> @@ -107,7 +119,9 @@ emit_vertex_input(struct anv_pipeline *pipeline,
>> if ((elements & (1 << desc->location)) == 0)
>> continue; /* Binding unused */
>>
>> - uint32_t slot = __builtin_popcount(elements & ((1 <<
>> desc->location) - 1));
>> + uint32_t slot =
>> + __builtin_popcount(elements & ((1 << desc->location) - 1)) -
>> + DIV_ROUND_UP(__builtin_popcount(elements_double & ((1 <<
>> desc->location) - 1)), 2);
>>
>> struct GENX(VERTEX_ELEMENT_STATE) element = {
>> .VertexBufferIndex = desc->binding,
>> @@ -137,7 +151,7 @@ emit_vertex_input(struct anv_pipeline *pipeline,
>> #endif
>> }
>>
>> - const uint32_t id_slot = __builtin_popcount(elements);
>> + const uint32_t id_slot = elem_count;
>> if (needs_svgs_elem) {
>> /* From the Broadwell PRM for the 3D_Vertex_Component_Control enum:
>> * "Within a VERTEX_ELEMENT_STATE structure, if a Component
>> diff --git a/src/mesa/drivers/dri/i965/brw_compiler.c
>> b/src/mesa/drivers/dri/i965/brw_compiler.c
>> index 1aa72bc..b9eceeb 100644
>> --- a/src/mesa/drivers/dri/i965/brw_compiler.c
>> +++ b/src/mesa/drivers/dri/i965/brw_compiler.c
>> @@ -55,6 +55,22 @@ static const struct nir_shader_compiler_options
>> scalar_nir_options = {
>> .lower_unpack_snorm_4x8 = true,
>> .lower_unpack_unorm_2x16 = true,
>> .lower_unpack_unorm_4x8 = true,
>> + .dvec3_consumes_two_locations = false,
>> +};
>> +
>> +static const struct nir_shader_compiler_options
>> vulkan_scalar_nir_options = {
>> + COMMON_OPTIONS,
>> + .lower_pack_half_2x16 = true,
>> + .lower_pack_snorm_2x16 = true,
>> + .lower_pack_snorm_4x8 = true,
>> + .lower_pack_unorm_2x16 = true,
>> + .lower_pack_unorm_4x8 = true,
>> + .lower_unpack_half_2x16 = true,
>> + .lower_unpack_snorm_2x16 = true,
>> + .lower_unpack_snorm_4x8 = true,
>> + .lower_unpack_unorm_2x16 = true,
>> + .lower_unpack_unorm_4x8 = true,
>> + .dvec3_consumes_two_locations = true,
>> };
>>
>> static const struct nir_shader_compiler_options vector_nir_options = {
>> @@ -75,6 +91,7 @@ static const struct nir_shader_compiler_options
>> vector_nir_options = {
>> .lower_unpack_unorm_2x16 = true,
>> .lower_extract_byte = true,
>> .lower_extract_word = true,
>> + .dvec3_consumes_two_locations = false,
>> };
>>
>> static const struct nir_shader_compiler_options vector_nir_options_gen6
>> = {
>> @@ -92,10 +109,11 @@ static const struct nir_shader_compiler_options
>> vector_nir_options_gen6 = {
>> .lower_unpack_unorm_2x16 = true,
>> .lower_extract_byte = true,
>> .lower_extract_word = true,
>> + .dvec3_consumes_two_locations = false,
>> };
>>
>> struct brw_compiler *
>> -brw_compiler_create(void *mem_ctx, const struct gen_device_info *devinfo)
>> +brw_compiler_create(void *mem_ctx, const struct gen_device_info
>> *devinfo, bool is_vulkan)
>> {
>> struct brw_compiler *compiler = rzalloc(mem_ctx, struct brw_compiler);
>>
>> @@ -138,7 +156,8 @@ brw_compiler_create(void *mem_ctx, const struct
>> gen_device_info *devinfo)
>> compiler->glsl_compiler_options[i].EmitNoIndirectSampler =
>> true;
>>
>> if (is_scalar) {
>> - compiler->glsl_compiler_options[i].NirOptions =
>> &scalar_nir_options;
>> + compiler->glsl_compiler_options[i].NirOptions = is_vulkan ?
>> &vulkan_scalar_nir_options
>> + :
>> &scalar_nir_options;
>> } else {
>> compiler->glsl_compiler_options[i].NirOptions =
>> devinfo->gen < 6 ? &vector_nir_options :
>> &vector_nir_options_gen6;
>> diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h
>> b/src/mesa/drivers/dri/i965/brw_compiler.h
>> index 65a7478..34df343 100644
>> --- a/src/mesa/drivers/dri/i965/brw_compiler.h
>> +++ b/src/mesa/drivers/dri/i965/brw_compiler.h
>> @@ -760,7 +760,7 @@ DEFINE_PROG_DATA_DOWNCAST(sf)
>> /** @} */
>>
>> struct brw_compiler *
>> -brw_compiler_create(void *mem_ctx, const struct gen_device_info
>> *devinfo);
>> +brw_compiler_create(void *mem_ctx, const struct gen_device_info
>> *devinfo, bool is_vulkan);
>>
>> /**
>> * Compile a vertex shader.
>> diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
>> b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
>> index 14415bd..00c5bb4 100644
>> --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
>> +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
>> @@ -35,10 +35,16 @@ using namespace brw;
>> fs_reg *
>> fs_visitor::emit_vs_system_value(int location)
>> {
>> - fs_reg *reg = new(this->mem_ctx)
>> - fs_reg(ATTR, 4 * (_mesa_bitcount_64(nir->info->inputs_read) +
>> - _mesa_bitcount_64(nir->info->d
>> ouble_inputs_read)),
>> - BRW_REGISTER_TYPE_D);
>> + fs_reg *reg;
>> + if (nir->options->dvec3_consumes_two_locations)
>> + reg = new(this->mem_ctx)
>> + fs_reg(ATTR, 4 * _mesa_bitcount_64(nir->info->inputs_read),
>> + BRW_REGISTER_TYPE_D);
>> + else
>> + reg = new(this->mem_ctx)
>> + fs_reg(ATTR, 4 * (_mesa_bitcount_64(nir->info->inputs_read) +
>> + _mesa_bitcount_64(nir->info->
>> double_inputs_read)),
>> + BRW_REGISTER_TYPE_D);
>> struct brw_vs_prog_data *vs_prog_data = brw_vs_prog_data(prog_data);
>>
>> switch (location) {
>> diff --git a/src/mesa/drivers/dri/i965/brw_nir.c
>> b/src/mesa/drivers/dri/i965/brw_nir.c
>> index 763e3ec..daa22c6 100644
>> --- a/src/mesa/drivers/dri/i965/brw_nir.c
>> +++ b/src/mesa/drivers/dri/i965/brw_nir.c
>> @@ -96,7 +96,7 @@ add_const_offset_to_base(nir_shader *nir,
>> nir_variable_mode mode)
>> }
>>
>> static bool
>> -remap_vs_attrs(nir_block *block, shader_info *nir_info)
>> +remap_vs_attrs(nir_block *block, nir_shader *nir)
>> {
>> nir_foreach_instr(instr, block) {
>> if (instr->type != nir_instr_type_intrinsic)
>> @@ -111,10 +111,13 @@ remap_vs_attrs(nir_block *block, shader_info
>> *nir_info)
>> * before it and counting the bits.
>> */
>> int attr = intrin->const_index[0];
>> - int slot = _mesa_bitcount_64(nir_info->inputs_read &
>> + int slot = _mesa_bitcount_64(nir->info->inputs_read &
>> + BITFIELD64_MASK(attr));
>> +
>> + int dslot = 0;
>> + if (!nir->options->dvec3_consumes_two_locations)
>> + dslot = _mesa_bitcount_64(nir->info->double_inputs_read &
>> BITFIELD64_MASK(attr));
>> - int dslot = _mesa_bitcount_64(nir_info->double_inputs_read &
>> - BITFIELD64_MASK(attr));
>> intrin->const_index[0] = 4 * (slot + dslot);
>> }
>> }
>> @@ -204,7 +207,10 @@ brw_nir_lower_vs_inputs(nir_shader *nir,
>> * loaded as one vec4 or dvec4 per element (or matrix column),
>> depending on
>> * whether it is a double-precision type or not.
>> */
>> - nir_lower_io(nir, nir_var_shader_in, type_size_vs_input, 0);
>> + nir_lower_io(nir,
>> + nir_var_shader_in,
>> + nir->options->dvec3_consumes_two_locations ?
>> type_size_vec4 : type_size_vs_input,
>> + 0);
>>
>> /* This pass needs actual constants */
>> nir_opt_constant_folding(nir);
>> @@ -220,7 +226,7 @@ brw_nir_lower_vs_inputs(nir_shader *nir,
>> nir_foreach_function(function, nir) {
>> if (function->impl) {
>> nir_foreach_block(block, function->impl) {
>> - remap_vs_attrs(block, nir->info);
>> + remap_vs_attrs(block, nir);
>> }
>> }
>> }
>> diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp
>> b/src/mesa/drivers/dri/i965/brw_vec4.cpp
>> index b9e592f..0aab97f 100644
>> --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
>> +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
>> @@ -2129,6 +2129,9 @@ brw_compile_vs(const struct brw_compiler *compiler,
>> void *log_data,
>>
>> unsigned nr_attributes = _mesa_bitcount_64(prog_data->inputs_read);
>>
>> + if (shader->options->dvec3_consumes_two_locations)
>> + nr_attributes -= DIV_ROUND_UP(_mesa_bitcount_64
>> (prog_data->double_inputs_read), 2);
>> +
>> /* gl_VertexID and gl_InstanceID are system values, but arrive via an
>> * incoming vertex attribute. So, add an extra slot.
>> */
>> @@ -2146,9 +2149,13 @@ brw_compile_vs(const struct brw_compiler
>> *compiler, void *log_data,
>> nr_attributes++;
>> }
>>
>> - unsigned nr_attribute_slots =
>> - nr_attributes +
>> - _mesa_bitcount_64(shader->info->double_inputs_read);
>> + unsigned nr_attribute_slots = nr_attributes;
>> + if (shader->options->dvec3_consumes_two_locations)
>> + nr_attribute_slots +=
>> + DIV_ROUND_UP(_mesa_bitcount_64(shader->info->double_inputs_read),
>> 2);
>> + else
>> + nr_attribute_slots +=
>> + _mesa_bitcount_64(shader->info->double_inputs_read);
>>
>> /* The 3DSTATE_VS documentation lists the lower bound on "Vertex URB
>> Entry
>> * Read Length" as 1 in vec4 mode, and 0 in SIMD8 mode. Empirically,
>> in
>> diff --git a/src/mesa/drivers/dri/i965/intel_screen.c
>> b/src/mesa/drivers/dri/i965/intel_screen.c
>> index e1c3c19..c050f86 100644
>> --- a/src/mesa/drivers/dri/i965/intel_screen.c
>> +++ b/src/mesa/drivers/dri/i965/intel_screen.c
>> @@ -1688,7 +1688,8 @@ __DRIconfig **intelInitScreen2(__DRIscreen
>> *dri_screen)
>> ? screenExtensions : intelRobustScreenExtensions;
>>
>> screen->compiler = brw_compiler_create(screen,
>> - &screen->devinfo);
>> + &screen->devinfo,
>> + false);
>> screen->compiler->shader_debug_log = shader_debug_log_mesa;
>> screen->compiler->shader_perf_log = shader_perf_log_mesa;
>> screen->program_id = 1;
>> --
>> 2.7.4
>>
>> _______________________________________________
>> mesa-dev mailing list
>> mesa-dev at lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
>>
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/mesa-dev/attachments/20161201/cd3c9a43/attachment-0001.html>
More information about the mesa-dev
mailing list